Loading all necessary libraries

library(DataExplorer)
library(plumber)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(ggplot2)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(corrplot)
## corrplot 0.92 loaded
library(MatchIt)
library(caTools)
library(glm2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(cluster)



# Load the cleaned IBM HR attrition dataset (expects ibmdatacleaned.csv in the
# working directory; read_csv reports 23,156 rows x 43 columns).
ab_dataset_cap <- read_csv("ibmdatacleaned.csv")
## Rows: 23156 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): Department, EducationField, JobRole, Attrition, BusinessTravel, Ed...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeNumber, Envir...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Dataset Exploration

# Preview the first rows of the dataset. The original statement,
# `data.frame(ab_dataset_cap)`, built an unassigned copy and echoed all
# 23,156 rows to the console; head() gives the same structural check cheaply.
head(as.data.frame(ab_dataset_cap))

Variable conversion for Numeric and Categorical variables

# --- Numeric variable conversion ---------------------------------------------
# Columns that should be stored as numeric.
numeric_vars <- c(
  "Age", "DailyRate", "DistanceFromHome", "Education", "EmployeeNumber",
  "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel",
  "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
  "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
  "StandardHours", "StockOptionLevel", "TotalWorkingYears",
  "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany",
  "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager",
  "Index"
)

# Keep only the names that actually exist in the data frame before converting.
existing_numeric_vars <- intersect(numeric_vars, names(ab_dataset_cap))

# Coerce via character first so any factor column maps through its labels
# rather than its underlying integer codes.
ab_dataset_cap[existing_numeric_vars] <- lapply(
  ab_dataset_cap[existing_numeric_vars],
  function(col) as.numeric(as.character(col))
)

# --- Categorical variable conversion -----------------------------------------
# Character columns that should be stored as factors.
# "RelationshipSatisfaction Desc" was previously converted by a separate
# one-off statement that bypassed the existence check; it now lives in this
# list so every factor conversion shares the same guarded code path.
categorical_vars <- c("Department", "EducationField", "JobRole", "Attrition",
                      "BusinessTravel", "Education Desc",
                      "EnvironmentSatisfaction Desc", "Gender",
                      "JobInvolvement Desc", "JobSatisfaction Desc",
                      "MaritalStatus", "OverTime", "WorkLifeBalance Desc",
                      "RelationshipSatisfaction Desc", "Employee Source")

# Keep only the names that actually exist in the data frame.
existing_categorical_vars <- categorical_vars[categorical_vars %in% names(ab_dataset_cap)]

# Convert each existing categorical column to a factor.
ab_dataset_cap[existing_categorical_vars] <- lapply(ab_dataset_cap[existing_categorical_vars], factor)

# Inspect the resulting structure to confirm the conversions took effect
# (factor columns show level counts, numeric columns show `num`).
str(ab_dataset_cap)
## spc_tbl_ [23,156 × 43] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Department                   : Factor w/ 3 levels "Human Resources",..: 1 3 1 1 1 3 3 3 3 3 ...
##  $ EducationField               : Factor w/ 7 levels "0","Human Resources",..: 2 3 4 2 4 3 3 3 3 3 ...
##  $ JobRole                      : Factor w/ 13 levels "Human Resources Associate",..: 3 11 3 3 3 11 11 11 12 12 ...
##  $ Age                          : num [1:23156] 37 41 37 37 37 41 41 41 41 41 ...
##  $ Attrition                    : Factor w/ 2 levels "Current employee",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ BusinessTravel               : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ DailyRate                    : num [1:23156] 807 1102 807 807 807 ...
##  $ DistanceFromHome             : num [1:23156] 6 1 6 6 6 1 1 1 1 1 ...
##  $ Education                    : num [1:23156] 4 2 4 4 4 2 2 2 2 2 ...
##  $ Education Desc               : Factor w/ 5 levels "Bachelor","Below College",..: 5 3 5 5 5 3 3 3 3 3 ...
##  $ EmployeeNumber               : num [1:23156] 1 1 4 5 6 7 8 9 10 11 ...
##  $ Unique Emp ID                : chr [1:23156] "Human Resources_1" "Sales_1" "Human Resources_4" "Human Resources_5" ...
##  $ Application ID               : chr [1:23156] "123457" "123456" "123459" "123460" ...
##  $ EnvironmentSatisfaction      : num [1:23156] 1 2 1 1 1 2 2 2 4 1 ...
##  $ EnvironmentSatisfaction Desc : Factor w/ 4 levels "High","Low","Medium",..: 2 3 2 2 2 3 3 3 4 2 ...
##  $ Gender                       : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ HourlyRate                   : num [1:23156] 37 94 37 37 37 94 94 94 33 41 ...
##  $ JobInvolvement               : num [1:23156] 3 3 3 3 3 3 3 3 3 3 ...
##  $ JobInvolvement Desc          : Factor w/ 4 levels "High","Low","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ JobLevel                     : num [1:23156] 2 2 2 2 2 2 2 2 4 5 ...
##  $ JobSatisfaction              : num [1:23156] 4 4 4 4 4 4 4 4 3 1 ...
##  $ JobSatisfaction Desc         : Factor w/ 4 levels "High","Low","Medium",..: 4 4 4 4 4 4 4 4 1 2 ...
##  $ MaritalStatus                : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 1 2 ...
##  $ MonthlyIncome                : num [1:23156] 5993 5993 5993 5993 5993 ...
##  $ MonthlyRate                  : num [1:23156] 19479 19479 19479 19479 19479 ...
##  $ NumCompaniesWorked           : num [1:23156] 8 8 5 8 5 8 4 8 2 5 ...
##  $ OverTime                     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ PercentSalaryHike            : num [1:23156] 11 11 11 11 11 11 11 11 14 11 ...
##  $ PerformanceRating            : num [1:23156] 4 3 3 3 3 3 3 3 3 3 ...
##  $ RelationshipSatisfaction     : num [1:23156] 1 1 1 1 1 1 1 1 3 4 ...
##  $ RelationshipSatisfaction Desc: Factor w/ 4 levels "High","Low","Medium",..: 2 2 2 2 2 2 2 2 1 4 ...
##  $ StandardHours                : num [1:23156] 80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel             : num [1:23156] 0 0 0 0 0 0 0 0 3 0 ...
##  $ TotalWorkingYears            : num [1:23156] 8 8 8 8 8 8 8 8 21 33 ...
##  $ TrainingTimesLastYear        : num [1:23156] 0 0 0 0 0 0 0 0 2 5 ...
##  $ WorkLifeBalance              : num [1:23156] 1 1 1 1 1 1 1 1 3 1 ...
##  $ WorkLifeBalance Desc         : Factor w/ 4 levels "Bad","Best","Better",..: 1 1 1 1 1 1 1 1 3 1 ...
##  $ YearsAtCompany               : num [1:23156] 6 6 6 6 6 6 6 6 5 29 ...
##  $ YearsInCurrentRole           : num [1:23156] 4 4 4 4 4 4 4 4 0 8 ...
##  $ YearsSinceLastPromotion      : num [1:23156] 0 0 0 0 0 0 0 0 0 11 ...
##  $ YearsWithCurrManager         : num [1:23156] 5 5 5 5 5 5 5 5 2 10 ...
##  $ Employee Source              : Factor w/ 10 levels "Adzuna","Company Website",..: 8 8 8 8 8 8 8 8 2 4 ...
##  $ Index                        : num [1:23156] 0 1 2 3 4 5 6 7 8 9 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Department = col_character(),
##   ..   EducationField = col_character(),
##   ..   JobRole = col_character(),
##   ..   Age = col_double(),
##   ..   Attrition = col_character(),
##   ..   BusinessTravel = col_character(),
##   ..   DailyRate = col_double(),
##   ..   DistanceFromHome = col_double(),
##   ..   Education = col_double(),
##   ..   `Education Desc` = col_character(),
##   ..   EmployeeNumber = col_double(),
##   ..   `Unique Emp ID` = col_character(),
##   ..   `Application ID` = col_character(),
##   ..   EnvironmentSatisfaction = col_double(),
##   ..   `EnvironmentSatisfaction Desc` = col_character(),
##   ..   Gender = col_character(),
##   ..   HourlyRate = col_double(),
##   ..   JobInvolvement = col_double(),
##   ..   `JobInvolvement Desc` = col_character(),
##   ..   JobLevel = col_double(),
##   ..   JobSatisfaction = col_double(),
##   ..   `JobSatisfaction Desc` = col_character(),
##   ..   MaritalStatus = col_character(),
##   ..   MonthlyIncome = col_double(),
##   ..   MonthlyRate = col_double(),
##   ..   NumCompaniesWorked = col_double(),
##   ..   OverTime = col_character(),
##   ..   PercentSalaryHike = col_double(),
##   ..   PerformanceRating = col_double(),
##   ..   RelationshipSatisfaction = col_double(),
##   ..   `RelationshipSatisfaction Desc` = col_character(),
##   ..   StandardHours = col_double(),
##   ..   StockOptionLevel = col_double(),
##   ..   TotalWorkingYears = col_double(),
##   ..   TrainingTimesLastYear = col_double(),
##   ..   WorkLifeBalance = col_double(),
##   ..   `WorkLifeBalance Desc` = col_character(),
##   ..   YearsAtCompany = col_double(),
##   ..   YearsInCurrentRole = col_double(),
##   ..   YearsSinceLastPromotion = col_double(),
##   ..   YearsWithCurrManager = col_double(),
##   ..   `Employee Source` = col_character(),
##   ..   Index = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Missing Values

# Count missing values in each column. vapply (rather than sapply) pins the
# result to a named integer vector regardless of input shape, per the
# recommendation to avoid sapply's type instability in scripts.
missing_values <- vapply(ab_dataset_cap, function(x) sum(is.na(x)), integer(1))
missing_values
##                    Department                EducationField 
##                             0                             0 
##                       JobRole                           Age 
##                             0                             0 
##                     Attrition                BusinessTravel 
##                             0                             0 
##                     DailyRate              DistanceFromHome 
##                             0                             0 
##                     Education                Education Desc 
##                             0                             0 
##                EmployeeNumber                 Unique Emp ID 
##                             0                             0 
##                Application ID       EnvironmentSatisfaction 
##                             0                             0 
##  EnvironmentSatisfaction Desc                        Gender 
##                             0                             0 
##                    HourlyRate                JobInvolvement 
##                             0                             0 
##           JobInvolvement Desc                      JobLevel 
##                             0                             0 
##               JobSatisfaction          JobSatisfaction Desc 
##                             0                             0 
##                 MaritalStatus                 MonthlyIncome 
##                             0                             0 
##                   MonthlyRate            NumCompaniesWorked 
##                             0                             0 
##                      OverTime             PercentSalaryHike 
##                             0                             0 
##             PerformanceRating      RelationshipSatisfaction 
##                             0                             0 
## RelationshipSatisfaction Desc                 StandardHours 
##                             0                             0 
##              StockOptionLevel             TotalWorkingYears 
##                             0                             0 
##         TrainingTimesLastYear               WorkLifeBalance 
##                             0                             0 
##          WorkLifeBalance Desc                YearsAtCompany 
##                             0                             0 
##            YearsInCurrentRole       YearsSinceLastPromotion 
##                             0                             0 
##          YearsWithCurrManager               Employee Source 
##                             0                             0 
##                         Index 
##                             0

# Summary of missing values

# Per-column NA counts. summarise_all() is superseded in dplyr; the
# across(everything(), ...) form is the current equivalent.
na_summary <- ab_dataset_cap %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  mutate(Source = "NA")

# Per-column zero counts, since zeros can be placeholders for missing data.
# (Factor columns with a literal "0" level, e.g. EducationField, count too.)
zero_summary <- ab_dataset_cap %>%
  summarise(across(everything(), ~ sum(.x == 0, na.rm = TRUE))) %>%
  mutate(Source = "Zero")

# Stack the two summaries for side-by-side comparison.
summary_combined <- bind_rows(na_summary, zero_summary)

print(summary_combined)
## # A tibble: 2 × 44
##   Department EducationField JobRole   Age Attrition BusinessTravel DailyRate
##        <int>          <int>   <int> <int>     <int>          <int>     <int>
## 1          0              0       0     0         0              0         0
## 2          0            125       0     0         0              0         0
## # ℹ 37 more variables: DistanceFromHome <int>, Education <int>,
## #   `Education Desc` <int>, EmployeeNumber <int>, `Unique Emp ID` <int>,
## #   `Application ID` <int>, EnvironmentSatisfaction <int>,
## #   `EnvironmentSatisfaction Desc` <int>, Gender <int>, HourlyRate <int>,
## #   JobInvolvement <int>, `JobInvolvement Desc` <int>, JobLevel <int>,
## #   JobSatisfaction <int>, `JobSatisfaction Desc` <int>, MaritalStatus <int>,
## #   MonthlyIncome <int>, MonthlyRate <int>, NumCompaniesWorked <int>, …

Attrition Class and Distribution

# Total employee count, used to convert bar counts to percentages.
total_count_numeric <- sum(table(ab_dataset_cap$Attrition))

# Bar chart of Attrition with count and rounded-percentage labels.
# after_stat(count) replaces the `..count..` notation deprecated in
# ggplot2 3.4.0 (the original emitted a deprecation warning).
ggplot(ab_dataset_cap, aes(x = factor(Attrition), fill = factor(Attrition))) +
  geom_bar() +
  geom_text(
    aes(label = paste0(
      after_stat(count), " (",
      round(after_stat(count) / total_count_numeric * 100, 2), "%)"
    )),
    stat = "count",
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal() +
  labs(title = "Distribution of Attrition", x = "Attrition Status", y = "Count") +
  scale_y_continuous(labels = scales::comma)

Column Names

# List all 43 column names for reference when selecting/dropping variables.
colnames(ab_dataset_cap)
##  [1] "Department"                    "EducationField"               
##  [3] "JobRole"                       "Age"                          
##  [5] "Attrition"                     "BusinessTravel"               
##  [7] "DailyRate"                     "DistanceFromHome"             
##  [9] "Education"                     "Education Desc"               
## [11] "EmployeeNumber"                "Unique Emp ID"                
## [13] "Application ID"                "EnvironmentSatisfaction"      
## [15] "EnvironmentSatisfaction Desc"  "Gender"                       
## [17] "HourlyRate"                    "JobInvolvement"               
## [19] "JobInvolvement Desc"           "JobLevel"                     
## [21] "JobSatisfaction"               "JobSatisfaction Desc"         
## [23] "MaritalStatus"                 "MonthlyIncome"                
## [25] "MonthlyRate"                   "NumCompaniesWorked"           
## [27] "OverTime"                      "PercentSalaryHike"            
## [29] "PerformanceRating"             "RelationshipSatisfaction"     
## [31] "RelationshipSatisfaction Desc" "StandardHours"                
## [33] "StockOptionLevel"              "TotalWorkingYears"            
## [35] "TrainingTimesLastYear"         "WorkLifeBalance"              
## [37] "WorkLifeBalance Desc"          "YearsAtCompany"               
## [39] "YearsInCurrentRole"            "YearsSinceLastPromotion"      
## [41] "YearsWithCurrManager"          "Employee Source"              
## [43] "Index"

Data Profiling Report

## # A tibble: 1 × 9
##    rows columns discrete_columns continuous_columns all_missing_columns
##   <int>   <int>            <int>              <int>               <int>
## 1 23156      43               17                 26                   0
## # ℹ 4 more variables: total_missing_values <int>, complete_rows <int>,
## #   total_observations <int>, memory_usage <dbl>
## spc_tbl_ [23,156 × 43] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Department                   : Factor w/ 3 levels "Human Resources",..: 1 3 1 1 1 3 3 3 3 3 ...
##  $ EducationField               : Factor w/ 7 levels "0","Human Resources",..: 2 3 4 2 4 3 3 3 3 3 ...
##  $ JobRole                      : Factor w/ 13 levels "Human Resources Associate",..: 3 11 3 3 3 11 11 11 12 12 ...
##  $ Age                          : num [1:23156] 37 41 37 37 37 41 41 41 41 41 ...
##  $ Attrition                    : Factor w/ 2 levels "Current employee",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ BusinessTravel               : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ DailyRate                    : num [1:23156] 807 1102 807 807 807 ...
##  $ DistanceFromHome             : num [1:23156] 6 1 6 6 6 1 1 1 1 1 ...
##  $ Education                    : num [1:23156] 4 2 4 4 4 2 2 2 2 2 ...
##  $ Education Desc               : Factor w/ 5 levels "Bachelor","Below College",..: 5 3 5 5 5 3 3 3 3 3 ...
##  $ EmployeeNumber               : num [1:23156] 1 1 4 5 6 7 8 9 10 11 ...
##  $ Unique Emp ID                : chr [1:23156] "Human Resources_1" "Sales_1" "Human Resources_4" "Human Resources_5" ...
##  $ Application ID               : chr [1:23156] "123457" "123456" "123459" "123460" ...
##  $ EnvironmentSatisfaction      : num [1:23156] 1 2 1 1 1 2 2 2 4 1 ...
##  $ EnvironmentSatisfaction Desc : Factor w/ 4 levels "High","Low","Medium",..: 2 3 2 2 2 3 3 3 4 2 ...
##  $ Gender                       : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ HourlyRate                   : num [1:23156] 37 94 37 37 37 94 94 94 33 41 ...
##  $ JobInvolvement               : num [1:23156] 3 3 3 3 3 3 3 3 3 3 ...
##  $ JobInvolvement Desc          : Factor w/ 4 levels "High","Low","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ JobLevel                     : num [1:23156] 2 2 2 2 2 2 2 2 4 5 ...
##  $ JobSatisfaction              : num [1:23156] 4 4 4 4 4 4 4 4 3 1 ...
##  $ JobSatisfaction Desc         : Factor w/ 4 levels "High","Low","Medium",..: 4 4 4 4 4 4 4 4 1 2 ...
##  $ MaritalStatus                : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 1 2 ...
##  $ MonthlyIncome                : num [1:23156] 5993 5993 5993 5993 5993 ...
##  $ MonthlyRate                  : num [1:23156] 19479 19479 19479 19479 19479 ...
##  $ NumCompaniesWorked           : num [1:23156] 8 8 5 8 5 8 4 8 2 5 ...
##  $ OverTime                     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ PercentSalaryHike            : num [1:23156] 11 11 11 11 11 11 11 11 14 11 ...
##  $ PerformanceRating            : num [1:23156] 4 3 3 3 3 3 3 3 3 3 ...
##  $ RelationshipSatisfaction     : num [1:23156] 1 1 1 1 1 1 1 1 3 4 ...
##  $ RelationshipSatisfaction Desc: Factor w/ 4 levels "High","Low","Medium",..: 2 2 2 2 2 2 2 2 1 4 ...
##  $ StandardHours                : num [1:23156] 80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel             : num [1:23156] 0 0 0 0 0 0 0 0 3 0 ...
##  $ TotalWorkingYears            : num [1:23156] 8 8 8 8 8 8 8 8 21 33 ...
##  $ TrainingTimesLastYear        : num [1:23156] 0 0 0 0 0 0 0 0 2 5 ...
##  $ WorkLifeBalance              : num [1:23156] 1 1 1 1 1 1 1 1 3 1 ...
##  $ WorkLifeBalance Desc         : Factor w/ 4 levels "Bad","Best","Better",..: 1 1 1 1 1 1 1 1 3 1 ...
##  $ YearsAtCompany               : num [1:23156] 6 6 6 6 6 6 6 6 5 29 ...
##  $ YearsInCurrentRole           : num [1:23156] 4 4 4 4 4 4 4 4 0 8 ...
##  $ YearsSinceLastPromotion      : num [1:23156] 0 0 0 0 0 0 0 0 0 11 ...
##  $ YearsWithCurrManager         : num [1:23156] 5 5 5 5 5 5 5 5 2 10 ...
##  $ Employee Source              : Factor w/ 10 levels "Adzuna","Company Website",..: 8 8 8 8 8 8 8 8 2 4 ...
##  $ Index                        : num [1:23156] 0 1 2 3 4 5 6 7 8 9 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Department = col_character(),
##   ..   EducationField = col_character(),
##   ..   JobRole = col_character(),
##   ..   Age = col_double(),
##   ..   Attrition = col_character(),
##   ..   BusinessTravel = col_character(),
##   ..   DailyRate = col_double(),
##   ..   DistanceFromHome = col_double(),
##   ..   Education = col_double(),
##   ..   `Education Desc` = col_character(),
##   ..   EmployeeNumber = col_double(),
##   ..   `Unique Emp ID` = col_character(),
##   ..   `Application ID` = col_character(),
##   ..   EnvironmentSatisfaction = col_double(),
##   ..   `EnvironmentSatisfaction Desc` = col_character(),
##   ..   Gender = col_character(),
##   ..   HourlyRate = col_double(),
##   ..   JobInvolvement = col_double(),
##   ..   `JobInvolvement Desc` = col_character(),
##   ..   JobLevel = col_double(),
##   ..   JobSatisfaction = col_double(),
##   ..   `JobSatisfaction Desc` = col_character(),
##   ..   MaritalStatus = col_character(),
##   ..   MonthlyIncome = col_double(),
##   ..   MonthlyRate = col_double(),
##   ..   NumCompaniesWorked = col_double(),
##   ..   OverTime = col_character(),
##   ..   PercentSalaryHike = col_double(),
##   ..   PerformanceRating = col_double(),
##   ..   RelationshipSatisfaction = col_double(),
##   ..   `RelationshipSatisfaction Desc` = col_character(),
##   ..   StandardHours = col_double(),
##   ..   StockOptionLevel = col_double(),
##   ..   TotalWorkingYears = col_double(),
##   ..   TrainingTimesLastYear = col_double(),
##   ..   WorkLifeBalance = col_double(),
##   ..   `WorkLifeBalance Desc` = col_character(),
##   ..   YearsAtCompany = col_double(),
##   ..   YearsInCurrentRole = col_double(),
##   ..   YearsSinceLastPromotion = col_double(),
##   ..   YearsWithCurrManager = col_double(),
##   ..   `Employee Source` = col_character(),
##   ..   Index = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Conversion of Variables for modelling

# Convert Attrition to a binary numeric variable (1 = "Voluntary Resignation")
ab_dataset_cap$Attrition_numeric <- as.numeric(ab_dataset_cap$Attrition == "Voluntary Resignation")

# Correlate the numeric columns, first dropping constant columns:
# StandardHours is 80 for every row, so its zero standard deviation made the
# original cor() call warn ("the standard deviation is zero") and produced
# NaN entries in the matrix.
numeric_data <- ab_dataset_cap[, sapply(ab_dataset_cap, is.numeric)]
has_variance <- vapply(
  numeric_data,
  function(x) {
    s <- sd(x, na.rm = TRUE)
    !is.na(s) && s > 0
  },
  logical(1)
)
correlations <- cor(numeric_data[, has_variance])

# Correlations of every remaining variable with the attrition indicator,
# strongest positive first (sort() drops nothing here since NaNs are gone).
cor_attrition <- correlations[, "Attrition_numeric"]
sorted_cor <- sort(cor_attrition, decreasing = TRUE)

# View the correlations sorted
print(sorted_cor)
##        Attrition_numeric         DistanceFromHome       NumCompaniesWorked 
##              1.000000000              0.069913505              0.040475987 
##               HourlyRate                    Index           EmployeeNumber 
##              0.012403772             -0.002001987             -0.002019309 
## RelationshipSatisfaction        PerformanceRating              MonthlyRate 
##             -0.005611355             -0.006735761             -0.008343636 
##  YearsSinceLastPromotion          WorkLifeBalance        PercentSalaryHike 
##             -0.018987128             -0.024119391             -0.024932085 
##                Education    TrainingTimesLastYear  EnvironmentSatisfaction 
##             -0.035920166             -0.045957616             -0.047779190 
##          JobSatisfaction                DailyRate           YearsAtCompany 
##             -0.053336866             -0.058597018             -0.061023246 
##     YearsWithCurrManager           JobInvolvement            MonthlyIncome 
##             -0.066725958             -0.067270649             -0.076901671 
##        TotalWorkingYears                 JobLevel       YearsInCurrentRole 
##             -0.082217359             -0.082659988             -0.085523590 
##         StockOptionLevel                      Age 
##             -0.093021550             -0.152760524

Interpretations of initial correlation matrix

DistanceFromHome (0.069913505): Positive correlation (0.07) with Attrition_numeric suggests that there is a slight tendency for employees who live farther from the workplace to have a slightly higher likelihood of attrition.

NumCompaniesWorked (0.040475987): Positive correlation (0.04) with Attrition_numeric suggests that employees who have worked at more companies before joining the current one may have a slightly higher tendency to leave.

HourlyRate (0.012403772): Very weak positive correlation (0.01) with Attrition_numeric, indicating that there’s almost no relationship between hourly rate and attrition.

Index (-0.002001987): The coefficient is very close to zero, indicating essentially no correlation between the row index and Attrition_numeric — as expected for an arbitrary identifier.

EmployeeNumber (-0.002019309): Almost no correlation with Attrition_numeric or other variables.

RelationshipSatisfaction (-0.005611355): Very weak negative correlation (-0.01) with Attrition_numeric, suggesting that employees with higher relationship satisfaction might have a slightly lower likelihood of attrition.

PerformanceRating (-0.006735761): Very weak negative correlation (-0.01) with Attrition_numeric, implying that employees with higher performance ratings might have a slightly lower likelihood of attrition.

MonthlyRate (-0.008343636): Very weak negative correlation (-0.01) with Attrition_numeric, indicating that there’s almost no relationship between monthly rate and attrition.

YearsSinceLastPromotion (-0.018987128): Weak negative correlation (-0.02) with Attrition_numeric, suggesting that employees who have been promoted more recently may have a slightly lower likelihood of attrition.

Initial variable selection

# Build the modelling subset by dropping identifier columns, the redundant
# "* Desc" label columns, the constant StandardHours column, and the original
# Attrition factor (its numeric encoding is kept instead).
cols_to_drop <- c(
  "Education Desc",
  "EnvironmentSatisfaction Desc",
  "JobInvolvement Desc",
  "JobSatisfaction Desc",
  "RelationshipSatisfaction Desc",
  "WorkLifeBalance Desc",
  "EmployeeNumber",
  "Unique Emp ID",
  "Application ID",
  "Index",
  "StandardHours",
  "Attrition",
  "Employee Source"
)
ab_dataset_reduced <- ab_dataset_cap[, setdiff(names(ab_dataset_cap), cols_to_drop)]

Correlation Matrix heatmap

# --- Correlation heatmap ------------------------------------------------------
# Use a distinct name for the numeric-column mask: `numeric_vars` already
# holds the character vector of column names from the type-conversion step,
# and the original code silently overwrote it with a logical vector.
is_numeric_col <- sapply(ab_dataset_reduced, is.numeric)
cor_matrix <- cor(ab_dataset_reduced[, is_numeric_col], use = "pairwise.complete.obs")

# Melt the correlation matrix into long (Var1, Var2, value) format for tiling.
melted_cor_matrix <- melt(cor_matrix)

# Heatmap with a diverging blue-white-red scale centred at zero correlation.
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(name = "Correlation", low = "blue", mid = "white", high = "red", midpoint = 0) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        axis.text.y = element_text(angle = 0),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  labs(title = "Correlation Matrix Heatmap with initial selected Variables")

Re-checking the variable names in the reduced dataset

# Confirm the 31 columns remaining after the drop step.
colnames(ab_dataset_reduced)
##  [1] "Department"               "EducationField"          
##  [3] "JobRole"                  "Age"                     
##  [5] "BusinessTravel"           "DailyRate"               
##  [7] "DistanceFromHome"         "Education"               
##  [9] "EnvironmentSatisfaction"  "Gender"                  
## [11] "HourlyRate"               "JobInvolvement"          
## [13] "JobLevel"                 "JobSatisfaction"         
## [15] "MaritalStatus"            "MonthlyIncome"           
## [17] "MonthlyRate"              "NumCompaniesWorked"      
## [19] "OverTime"                 "PercentSalaryHike"       
## [21] "PerformanceRating"        "RelationshipSatisfaction"
## [23] "StockOptionLevel"         "TotalWorkingYears"       
## [25] "TrainingTimesLastYear"    "WorkLifeBalance"         
## [27] "YearsAtCompany"           "YearsInCurrentRole"      
## [29] "YearsSinceLastPromotion"  "YearsWithCurrManager"    
## [31] "Attrition_numeric"

Attrition Class Distribution

# Bar plot of the 0/1 attrition indicator with count labels.
# Fixes two ggplot2 3.4.0 deprecations the original triggered:
#   * `..count..`             -> after_stat(count)
#   * element_line(size = .)  -> element_line(linewidth = .)
ggplot(ab_dataset_reduced, aes(x = factor(Attrition_numeric), fill = factor(Attrition_numeric))) +
  geom_bar(stat = "count", position = "dodge", color = "black") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, color = "black") +
  scale_fill_manual(values = c("#56B4E9", "#E69F00"), labels = c("No Attrition (0)", "Attrition (1)")) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Attrition Numeric Distribution", x = "Attrition Status", y = "Frequency", fill = "Attrition Status") +
  theme_minimal() +
  theme(text = element_text(size = 12, face = "bold"),
        plot.title = element_text(hjust = 0.5),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major = element_line(linewidth = 0.1, linetype = "solid", color = "gray"),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", color = "white"),
        legend.position = "bottom")

Data Pre-processing Steps

# Convert 'Attrition_numeric' to a factor (required as a classification target)
ab_dataset_reduced$Attrition_numeric <- as.factor(ab_dataset_reduced$Attrition_numeric)

# --- Multicollinearity check --------------------------------------------------
cor_matrix <- cor(ab_dataset_reduced[, sapply(ab_dataset_reduced, is.numeric)], use = "pairwise.complete.obs")

# Flag variable pairs with |r| > 0.7. Exclude the diagonal by POSITION rather
# than by value: the original `cor_matrix != 1` test fails under floating
# point with pairwise.complete.obs (its own output listed the self-pairs
# MonthlyRate~MonthlyRate and TrainingTimesLastYear~TrainingTimesLastYear at
# "1.0000000"). upper.tri() also reports each pair once instead of twice.
high_cor <- which(abs(cor_matrix) > 0.7 & upper.tri(cor_matrix), arr.ind = TRUE)

high_cor_vars <- data.frame(
  Var1 = rownames(cor_matrix)[high_cor[, 1]],
  Var2 = colnames(cor_matrix)[high_cor[, 2]],
  Correlation = cor_matrix[high_cor]
)

print(high_cor_vars)
##                     Var1                  Var2 Correlation
## 1          MonthlyIncome              JobLevel   0.9431299
## 2      TotalWorkingYears              JobLevel   0.7665359
## 3               JobLevel         MonthlyIncome   0.9431299
## 4      TotalWorkingYears         MonthlyIncome   0.7640761
## 5            MonthlyRate           MonthlyRate   1.0000000
## 6               JobLevel     TotalWorkingYears   0.7665359
## 7          MonthlyIncome     TotalWorkingYears   0.7640761
## 8  TrainingTimesLastYear TrainingTimesLastYear   1.0000000
## 9     YearsInCurrentRole        YearsAtCompany   0.7583112
## 10  YearsWithCurrManager        YearsAtCompany   0.7705981
## 11        YearsAtCompany    YearsInCurrentRole   0.7583112
## 12  YearsWithCurrManager    YearsInCurrentRole   0.7158998
## 13        YearsAtCompany  YearsWithCurrManager   0.7705981
## 14    YearsInCurrentRole  YearsWithCurrManager   0.7158998

Plotting positive correlation 1

# Scatter of MonthlyIncome against TotalWorkingYears with a loess trend,
# illustrating their strong positive correlation (r ~ 0.76 per the scan above).
p1 <- ab_dataset_reduced %>%
  select(TotalWorkingYears, MonthlyIncome) %>%
  ggplot(aes(x = TotalWorkingYears, y = MonthlyIncome)) +
  geom_point(colour = "#6D98BA", alpha = 0.5) +
  geom_smooth(method = "loess", color = "#D95F02") +
  labs(title = "Positive Correlation",
       subtitle = "Monthly Income vs Working Years") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

# Arrange the plots into a grid
plot_grid(p1)
## `geom_smooth()` using formula = 'y ~ x'

Attrition Numeric against years in current role

# YearsInCurrentRole vs Attrition_numeric: dodged histogram with a dashed
# line at the overall mean, labelled with its value.
mean_years_in_current_role <- mean(ab_dataset_reduced$YearsInCurrentRole)
p2 <- ab_dataset_reduced %>% select(YearsInCurrentRole, Attrition_numeric) %>%
  ggplot(aes(x = YearsInCurrentRole, fill = Attrition_numeric)) +
  geom_histogram(position = "dodge", bins = 20, color = "black", alpha = 0.7) +
  # geom_vline() has no `label` aesthetic — the old aes(label=...) was silently
  # ignored — and `size` is deprecated for lines since ggplot2 3.4.0.
  geom_vline(xintercept = mean_years_in_current_role, linetype = "dashed",
             linewidth = 1, color = "red") +
  # Draw the mean label the vline mapping was trying (and failing) to add
  annotate("text", x = mean_years_in_current_role, y = Inf, vjust = 1.5, hjust = -0.1,
           label = paste("Mean:", round(mean_years_in_current_role, 2)), color = "red") +
  theme_minimal() +
  labs(title = "Attrition by Years in Current Role", x = "Years in Current Role", y = "Count", fill = "Attrition") +
  theme(legend.position = "top")
print(p2)

Training times Last year vs Attrition numeric

# TrainingTimesLastYear vs Attrition_numeric: dodged histogram with a dashed
# line at the overall mean, labelled with its value.
mean_training_times_last_year <- mean(ab_dataset_reduced$TrainingTimesLastYear)
p3 <- ab_dataset_reduced %>% select(TrainingTimesLastYear, Attrition_numeric) %>%
  ggplot(aes(x = TrainingTimesLastYear, fill = Attrition_numeric)) +
  geom_histogram(position = "dodge", bins = 20, color = "black", alpha = 0.7) +
  # geom_vline() has no `label` aesthetic — the old aes(label=...) was silently
  # ignored — and `size` is deprecated for lines since ggplot2 3.4.0.
  geom_vline(xintercept = mean_training_times_last_year, linetype = "dashed",
             linewidth = 1, color = "red") +
  # Draw the mean label the vline mapping was trying (and failing) to add
  annotate("text", x = mean_training_times_last_year, y = Inf, vjust = 1.5, hjust = -0.1,
           label = paste("Mean:", round(mean_training_times_last_year, 2)), color = "red") +
  theme_minimal() +
  labs(title = "Attrition by Training Times Last Year", x = "Training Times Last Year", y = "Count", fill = "Attrition") +
  theme(legend.position = "top")
print(p3)

Distance from home vs Attrition numeric

# DistanceFromHome vs Attrition_numeric: dodged histogram with a dashed
# line at the overall mean, labelled with its value.
mean_distance_from_home <- mean(ab_dataset_reduced$DistanceFromHome)
p4 <- ab_dataset_reduced %>% select(DistanceFromHome, Attrition_numeric) %>%
  ggplot(aes(x = DistanceFromHome, fill = Attrition_numeric)) +
  geom_histogram(position = "dodge", bins = 20, color = "black", alpha = 0.7) +
  # geom_vline() has no `label` aesthetic — the old aes(label=...) was silently
  # ignored — and `size` is deprecated for lines since ggplot2 3.4.0.
  geom_vline(xintercept = mean_distance_from_home, linetype = "dashed",
             linewidth = 1, color = "red") +
  # Draw the mean label the vline mapping was trying (and failing) to add
  annotate("text", x = mean_distance_from_home, y = Inf, vjust = 1.5, hjust = -0.1,
           label = paste("Mean:", round(mean_distance_from_home, 2)), color = "red") +
  theme_minimal() +
  labs(title = "Attrition by Distance from Home", x = "Distance from Home", y = "Count", fill = "Attrition") +
  theme(legend.position = "top")
print(p4)

Calculating mean age for each attrition status (the code below groups by Attrition_numeric, not by department)

# Mean age and the largest single-age count for each attrition status.
# NOTE(review): the identical summary is computed again in the following
# chunk — confirm whether one of the two can be dropped.
mean_ages <- ab_dataset_reduced %>%
  group_by(Attrition_numeric) %>%
  summarise(MeanAge = mean(Age, na.rm = TRUE),
            MaxCount = max(table(Age)))

Calculate the mean age and the maximum count for each attrition status

# Compute, per attrition status, the mean age and the modal age count; these
# position the dashed mean lines and their text labels in the histogram below.
mean_ages <- ab_dataset_reduced %>%
  group_by(Attrition_numeric) %>%
  summarise(MeanAge = mean(Age, na.rm = TRUE),
            MaxCount = max(table(Age)))

# Histogram of Age overlaid by attrition status, with a dashed mean-age
# reference line and a "Mean: x" label for each group.
age_distribution_plot <- ab_dataset_reduced %>%
  ggplot(aes(x = Age, fill = factor(Attrition_numeric))) + 
  geom_histogram(binwidth = 1, position = 'identity', alpha = 0.6) +
  scale_fill_manual(values = c("#5DA5B3", "#26547C"), 
                    labels = c("Stayed", "Left"),
                    name = "Attrition Status") +
  geom_vline(data = mean_ages,
             aes(xintercept = MeanAge, color = factor(Attrition_numeric)),
             linetype = "dashed",
             # `size` is deprecated for lines since ggplot2 3.4.0 — use `linewidth`
             linewidth = 1.5) +
  # Fixed color = "black" overrides the mapped color for the label text
  geom_text(data = mean_ages, 
            aes(x = MeanAge, label = paste("Mean:", round(MeanAge, 1)), y = MaxCount, color = factor(Attrition_numeric)),
            size = 4, vjust = 1.5, fontface = "bold", color = "black") +
  scale_color_manual(values = c("#5DA5B3", "#26547C")) +
  # guides(<scale> = FALSE) is deprecated since ggplot2 3.3.4 — use "none"
  guides(color = "none") +
  theme_minimal() +
  labs(title = "Age Distribution by Attrition Status",
       x = "Age",
       y = "Count") +
  theme(legend.position = "top",
        plot.title = element_text(size = 14, face = "bold"),
        axis.title = element_text(size = 12),
        legend.background = element_blank(),
        legend.key = element_blank())
# Print the plot
print(age_distribution_plot)
Total Working Years vs Attrition

Total Working Years vs Attrition

Age vs Monthly Income

# Scatter of MonthlyIncome against Age with a loess trend line; note this
# reuses (overwrites) the earlier plot object name `p1`.
p1 <- ab_dataset_reduced %>%
  select(Age, MonthlyIncome) %>%
  ggplot(aes(x = Age, y = MonthlyIncome)) +
  geom_point(colour = "#6D98BA", alpha = 0.5) +
  geom_smooth(method = "loess", color = "#D95F02") +
  labs(title = "Positive Correlation",
       subtitle = "Monthly Income vs Age") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

# Arrange the plots into a grid
plot_grid(p1)
## `geom_smooth()` using formula = 'y ~ x'

# Define the color palette.
# NOTE(review): RColorBrewer is never attached in the setup chunk, so a bare
# brewer.pal() call would fail; qualify it with the namespace (package must
# still be installed). `color_palette` is not used by the department plot
# below — confirm it is still needed.
color_palette <- RColorBrewer::brewer.pal(9, "Blues")

# Mean age and modal age count per Department x Attrition combination,
# used to position the text labels on the department plot.
mean_ages_dept <- ab_dataset_reduced %>%
  group_by(Department, Attrition_numeric) %>%
  summarise(
    MeanAge = mean(Age, na.rm = TRUE),
    MaxCount = max(table(Age)),
    .groups = 'drop' # This will drop the grouping so that we don't carry it into our plot
  )

Age Distribution

# Dodged bar chart of attrition counts per department, with a "Mean: x"
# age label above each bar (positions come from mean_ages_dept).
age_distribution_plot_dept <- ab_dataset_reduced %>%
  ggplot(aes(x = Department, fill = factor(Attrition_numeric))) + 
  geom_bar(position = 'dodge', alpha = 0.6) +
  scale_fill_manual(values = c("#6D88BA", "#D93F02"), 
                    labels = c("Stayed", "Left"),
                    name = "Attrition Status") +
  geom_text(data = mean_ages_dept, 
            aes(x = Department, y = MaxCount, 
                label = paste("Mean:", round(MeanAge, 1)), 
                color = factor(Attrition_numeric)),
            size = 4, vjust = -0.5, fontface = "bold", 
            position = position_dodge(width = 0.9)) +
  scale_color_manual(values = c("black", "black")) + # Both label colors black for readability
  theme_minimal() +
  labs(title = "Attrition Distribution by Department",
       x = "Department",
       y = "Count") +
  theme(legend.position = "top",
        plot.title = element_text(size = 14, face = "bold"),
        axis.title = element_text(size = 12),
        legend.text = element_text(color = "black"), # Ensure legend text is black for visibility
        legend.background = element_blank(),
        legend.key = element_blank(),
        legend.title = element_text(color = "black")) + # Ensure legend title is black for visibility
  # guides(<scale> = FALSE) is deprecated since ggplot2 3.3.4 — use "none"
  guides(color = "none") # Remove the color guide for the text

# Print the plot
print(age_distribution_plot_dept)

Scaling the numerical Variables

# Standardise every numeric column of "ab_dataset_reduced" in place
# (scale() centres each column to mean 0 and rescales to sd 1).
numeric_vars <- vapply(ab_dataset_reduced, is.numeric, logical(1))
ab_dataset_reduced[numeric_vars] <- scale(ab_dataset_reduced[numeric_vars])

# Coerce back to a plain data frame after the matrix-valued assignment
ab_dataset_reduced <- as.data.frame(ab_dataset_reduced)

# Inspect the structure to confirm the transformation took effect
str(ab_dataset_reduced)
## 'data.frame':    23156 obs. of  31 variables:
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 1 3 1 1 1 3 3 3 3 3 ...
##  $ EducationField          : Factor w/ 7 levels "0","Human Resources",..: 2 3 4 2 4 3 3 3 3 3 ...
##  $ JobRole                 : Factor w/ 13 levels "Human Resources Associate",..: 3 11 3 3 3 11 11 11 12 12 ...
##  $ Age                     : num  0.00925 0.44817 0.00925 0.00925 0.00925 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ DailyRate               : num  0.012 0.744 0.012 0.012 0.012 ...
##  $ DistanceFromHome        : num  -0.394 -1.011 -0.394 -0.394 -0.394 ...
##  $ Education               : num  0.974 -1.086 0.974 0.974 0.974 ...
##  $ EnvironmentSatisfaction : num  -1.571 -0.658 -1.571 -1.571 -1.571 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ HourlyRate              : num  -1.42 1.38 -1.42 -1.42 -1.42 ...
##  $ JobInvolvement          : num  0.379 0.379 0.379 0.379 0.379 ...
##  $ JobLevel                : num  -0.0588 -0.0588 -0.0588 -0.0588 -0.0588 ...
##  $ JobSatisfaction         : num  1.16 1.16 1.16 1.16 1.16 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 1 2 ...
##  $ MonthlyIncome           : num  -0.109 -0.109 -0.109 -0.109 -0.109 ...
##  $ MonthlyRate             : num  0.73 0.73 0.73 0.73 0.73 ...
##  $ NumCompaniesWorked      : num  2.126 2.126 0.925 2.126 0.925 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ PercentSalaryHike       : num  -1.15 -1.15 -1.15 -1.15 -1.15 ...
##  $ PerformanceRating       : num  2.333 -0.429 -0.429 -0.429 -0.429 ...
##  $ RelationshipSatisfaction: num  -1.59 -1.59 -1.59 -1.59 -1.59 ...
##  $ StockOptionLevel        : num  -0.934 -0.934 -0.934 -0.934 -0.934 ...
##  $ TotalWorkingYears       : num  -0.421 -0.421 -0.421 -0.421 -0.421 ...
##  $ TrainingTimesLastYear   : num  -2.17 -2.17 -2.17 -2.17 -2.17 ...
##  $ WorkLifeBalance         : num  -2.5 -2.5 -2.5 -2.5 -2.5 ...
##  $ YearsAtCompany          : num  -0.166 -0.166 -0.166 -0.166 -0.166 ...
##  $ YearsInCurrentRole      : num  -0.0637 -0.0637 -0.0637 -0.0637 -0.0637 ...
##  $ YearsSinceLastPromotion : num  -0.679 -0.679 -0.679 -0.679 -0.679 ...
##  $ YearsWithCurrManager    : num  0.243 0.243 0.243 0.243 0.243 ...
##  $ Attrition_numeric       : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

Summary of numerical Variables

# Five-number summaries of the scaled numeric columns — all means should be ~0
summary(ab_dataset_reduced[numeric_vars])
##       Age            DailyRate          DistanceFromHome    Education       
##  Min.   :-2.0756   Min.   :-1.7366401   Min.   :-1.0113   Min.   :-2.11567  
##  1st Qu.:-0.7589   1st Qu.:-0.8362733   1st Qu.:-0.8878   1st Qu.:-1.08579  
##  Median :-0.1005   Median :-0.0003957   Median :-0.2704   Median :-0.05591  
##  Mean   : 0.0000   Mean   : 0.0000000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6676   3rd Qu.: 0.8801283   3rd Qu.: 0.5940   3rd Qu.: 0.97398  
##  Max.   : 2.5330   Max.   : 1.7284077   Max.   : 2.4462   Max.   : 2.00386  
##  EnvironmentSatisfaction   HourlyRate        JobInvolvement   
##  Min.   :-1.5711         Min.   :-1.763489   Min.   :-2.4383  
##  1st Qu.:-0.6576         1st Qu.:-0.878104   1st Qu.:-1.0297  
##  Median : 0.2559         Median : 0.007282   Median : 0.3789  
##  Mean   : 0.0000         Mean   : 0.000000   Mean   : 0.0000  
##  3rd Qu.: 1.1693         3rd Qu.: 0.843479   3rd Qu.: 0.3789  
##  Max.   : 1.1693         Max.   : 1.679677   Max.   : 1.7876  
##     JobLevel        JobSatisfaction   MonthlyIncome      MonthlyRate      
##  Min.   :-0.96299   Min.   :-1.5685   Min.   :-1.1681   Min.   :-1.71862  
##  1st Qu.:-0.96299   1st Qu.:-0.6605   1st Qu.:-0.7640   1st Qu.:-0.88050  
##  Median :-0.05884   Median : 0.2476   Median :-0.3337   Median :-0.01111  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.84530   3rd Qu.: 1.1556   3rd Qu.: 0.3980   3rd Qu.: 0.86589  
##  Max.   : 2.65359   Max.   : 1.1556   Max.   : 2.8666   Max.   : 1.78894  
##  NumCompaniesWorked PercentSalaryHike PerformanceRating
##  Min.   :-1.0760    Min.   :-1.1518   Min.   :-0.4286  
##  1st Qu.:-0.6758    1st Qu.:-0.8785   1st Qu.:-0.4286  
##  Median :-0.2756    Median :-0.3320   Median :-0.4286  
##  Mean   : 0.0000    Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5249    3rd Qu.: 0.7612   3rd Qu.:-0.4286  
##  Max.   : 2.5260    Max.   : 2.6741   Max.   : 2.3330  
##  RelationshipSatisfaction StockOptionLevel  TotalWorkingYears
##  Min.   :-1.5864          Min.   :-0.9336   Min.   :-1.4512  
##  1st Qu.:-0.6609          1st Qu.:-0.9336   1st Qu.:-0.6783  
##  Median : 0.2647          Median : 0.2415   Median :-0.1631  
##  Mean   : 0.0000          Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.1902          3rd Qu.: 0.2415   3rd Qu.: 0.4810  
##  Max.   : 1.1902          Max.   : 2.5918   Max.   : 3.7013  
##  TrainingTimesLastYear WorkLifeBalance   YearsAtCompany    YearsInCurrentRole
##  Min.   :-2.1699       Min.   :-2.4981   Min.   :-1.1433   Min.   :-1.1665   
##  1st Qu.:-0.6195       1st Qu.:-1.0804   1st Qu.:-0.6544   1st Qu.:-0.6151   
##  Median : 0.1557       Median : 0.3374   Median :-0.3285   Median :-0.3394   
##  Mean   : 0.0000       Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   
##  3rd Qu.: 0.1557       3rd Qu.: 0.3374   3rd Qu.: 0.4863   3rd Qu.: 0.7634   
##  Max.   : 2.4812       Max.   : 1.7552   Max.   : 5.3753   Max.   : 3.7961   
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   :-0.6787         Min.   :-1.1565     
##  1st Qu.:-0.6787         1st Qu.:-0.5966     
##  Median :-0.3676         Median :-0.3167     
##  Mean   : 0.0000         Mean   : 0.0000     
##  3rd Qu.: 0.2546         3rd Qu.: 0.8032     
##  Max.   : 3.9875         Max.   : 3.6027

Splitting the Dataset into Training and Testing

set.seed(123)  # for reproducible random sampling

# Stratified 70/30 split on the outcome so both sets keep the class ratio
index <- createDataPartition(y = ab_dataset_reduced$Attrition_numeric, p = 0.7, list = FALSE)
training_set <- ab_dataset_reduced[index, ]
testing_set <- ab_dataset_reduced[-index, ]

# Check the number of cases in each set
nrow(training_set)  # Should be approximately 70% of the data
## [1] 16210
nrow(testing_set)   # Should be approximately 30% of the data
## [1] 6946
# Re-declare the outcome as a factor with explicit levels (0 = stay, 1 = leave)
# in both partitions so downstream functions agree on the level order
training_set$Attrition_numeric <- factor(training_set$Attrition_numeric, levels = c(0, 1))
testing_set$Attrition_numeric <- factor(testing_set$Attrition_numeric, levels = c(0, 1))

Balancing the Attrition_numeric Class

# Check initial class distribution
table(training_set$Attrition_numeric)
## 
##     0     1 
## 13650  2560
# Balance the classes by randomly down-sampling the majority class;
# caret::downSample returns the outcome in a new column named 'Class'
training_set <- downSample(x = training_set[, -which(names(training_set) == "Attrition_numeric")],
                                    y = training_set$Attrition_numeric)

# Check the new class distribution to confirm balancing
table(training_set$Class)
## 
##    0    1 
## 2560 2560
# Renaming 'Class' back to 'Attrition_numeric'
training_set$Attrition_numeric <- training_set$Class
training_set$Class <- NULL

Visual Representation of Attrition after balancing

# Tabulate counts and percentage shares of each class after balancing
attrition_counts <- table(training_set$Attrition_numeric)
attrition_df <- data.frame(Attrition_numeric = names(attrition_counts),
                           Count = as.integer(attrition_counts))
attrition_df$Percentage <- (attrition_df$Count / sum(attrition_df$Count)) * 100

# Bar plot of the balanced classes, each bar annotated "count (share%)"
ggplot(attrition_df, aes(x = Attrition_numeric, y = Count, fill = Attrition_numeric)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(Count, " (", sprintf("%.1f", Percentage), "%)")),
            vjust = -0.5, position = position_stack(vjust = 0.5)) +
  labs(title = "Distribution of Attrition Numeric",
       x = "Attrition Numeric Status",
       y = "Count") +
  theme_minimal()

# Column names after balancing ('Class' has been renamed back to 'Attrition_numeric')
names(training_set)
##  [1] "Department"               "EducationField"          
##  [3] "JobRole"                  "Age"                     
##  [5] "BusinessTravel"           "DailyRate"               
##  [7] "DistanceFromHome"         "Education"               
##  [9] "EnvironmentSatisfaction"  "Gender"                  
## [11] "HourlyRate"               "JobInvolvement"          
## [13] "JobLevel"                 "JobSatisfaction"         
## [15] "MaritalStatus"            "MonthlyIncome"           
## [17] "MonthlyRate"              "NumCompaniesWorked"      
## [19] "OverTime"                 "PercentSalaryHike"       
## [21] "PerformanceRating"        "RelationshipSatisfaction"
## [23] "StockOptionLevel"         "TotalWorkingYears"       
## [25] "TrainingTimesLastYear"    "WorkLifeBalance"         
## [27] "YearsAtCompany"           "YearsInCurrentRole"      
## [29] "YearsSinceLastPromotion"  "YearsWithCurrManager"    
## [31] "Attrition_numeric"

Fitting Logistic model 1 on training set

# Fit a logistic regression (logit link) of attrition on every remaining
# predictor of the balanced training set
logistic_model <- glm(Attrition_numeric ~ ., data = training_set, family = binomial())

# Print out the results of the logistic regression model
# NOTE(review): the summary reports two coefficients "not defined because of
# singularities" (aliased JobRole dummies) — consistent with the collinearity
# flagged earlier; confirm whether those predictors should be dropped
summary(logistic_model)
## 
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = training_set)
## 
## Coefficients: (2 not defined because of singularities)
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -2.224359   0.562372  -3.955 7.64e-05
## DepartmentResearch & Development       -0.067411   0.184810  -0.365 0.715292
## DepartmentSales                         0.648032   0.215177   3.012 0.002599
## EducationFieldHuman Resources           0.159301   0.613398   0.260 0.795094
## EducationFieldLife Sciences             0.390465   0.519361   0.752 0.452160
## EducationFieldMarketing                 0.358039   0.534105   0.670 0.502633
## EducationFieldMedical                   0.381208   0.518736   0.735 0.462414
## EducationFieldOther                     0.268788   0.532633   0.505 0.613812
## EducationFieldTechnical Degree          0.836985   0.529792   1.580 0.114144
## JobRoleHuman Resources Director         0.891298   0.823537   1.082 0.279128
## JobRoleHuman Resources Executive        0.051361   0.494169   0.104 0.917221
## JobRoleHuman Resources Manager        -13.803497 188.646196  -0.073 0.941670
## JobRoleHuman Resources Representative   0.104206   0.239413   0.435 0.663376
## JobRoleLaboratory Technician            0.126137   0.099904   1.263 0.206738
## JobRoleManufacturing Director           0.016855   0.142587   0.118 0.905900
## JobRoleResearch & Development Manager   0.039227   0.242293   0.162 0.871387
## JobRoleResearch Director                0.128618   0.150145   0.857 0.391654
## JobRoleResearch Scientist                     NA         NA      NA       NA
## JobRoleSales Executive                 -0.429819   0.144299  -2.979 0.002895
## JobRoleSales Manager                   -0.241953   0.336164  -0.720 0.471680
## JobRoleSales Representative                   NA         NA      NA       NA
## Age                                    -0.395401   0.032964 -11.995  < 2e-16
## BusinessTravelTravel_Frequently         1.435087   0.134116  10.700  < 2e-16
## BusinessTravelTravel_Rarely             0.745049   0.121789   6.118 9.50e-10
## DailyRate                              -0.171833   0.031455  -5.463 4.69e-08
## DistanceFromHome                        0.202683   0.034596   5.859 4.67e-09
## Education                              -0.006071   0.032201  -0.189 0.850467
## EnvironmentSatisfaction                -0.199345   0.035150  -5.671 1.42e-08
## GenderMale                              0.138165   0.065023   2.125 0.033599
## HourlyRate                              0.120516   0.035517   3.393 0.000691
## JobInvolvement                         -0.191046   0.030604  -6.242 4.31e-10
## JobLevel                               -0.077732   0.106475  -0.730 0.465359
## JobSatisfaction                        -0.131958   0.031040  -4.251 2.13e-05
## MaritalStatusMarried                    0.235618   0.088224   2.671 0.007570
## MaritalStatusSingle                     0.628525   0.110642   5.681 1.34e-08
## MonthlyIncome                          -0.152886   0.108200  -1.413 0.157656
## MonthlyRate                            -0.047991   0.031908  -1.504 0.132574
## NumCompaniesWorked                      0.025198   0.034039   0.740 0.459146
## OverTimeYes                             0.887989   0.067250  13.204  < 2e-16
## PercentSalaryHike                      -0.135431   0.034704  -3.902 9.52e-05
## PerformanceRating                       0.077937   0.035245   2.211 0.027015
## RelationshipSatisfaction               -0.059815   0.031050  -1.926 0.054050
## StockOptionLevel                       -0.088709   0.040250  -2.204 0.027530
## TotalWorkingYears                       0.139481   0.059914   2.328 0.019911
## TrainingTimesLastYear                  -0.129567   0.032672  -3.966 7.32e-05
## WorkLifeBalance                        -0.087203   0.030951  -2.817 0.004841
## YearsAtCompany                          0.113734   0.067934   1.674 0.094095
## YearsInCurrentRole                     -0.294169   0.053191  -5.530 3.19e-08
## YearsSinceLastPromotion                 0.200179   0.040968   4.886 1.03e-06
## YearsWithCurrManager                   -0.097849   0.053868  -1.816 0.069300
##                                          
## (Intercept)                           ***
## DepartmentResearch & Development         
## DepartmentSales                       ** 
## EducationFieldHuman Resources            
## EducationFieldLife Sciences              
## EducationFieldMarketing                  
## EducationFieldMedical                    
## EducationFieldOther                      
## EducationFieldTechnical Degree           
## JobRoleHuman Resources Director          
## JobRoleHuman Resources Executive         
## JobRoleHuman Resources Manager           
## JobRoleHuman Resources Representative    
## JobRoleLaboratory Technician             
## JobRoleManufacturing Director            
## JobRoleResearch & Development Manager    
## JobRoleResearch Director                 
## JobRoleResearch Scientist                
## JobRoleSales Executive                ** 
## JobRoleSales Manager                     
## JobRoleSales Representative              
## Age                                   ***
## BusinessTravelTravel_Frequently       ***
## BusinessTravelTravel_Rarely           ***
## DailyRate                             ***
## DistanceFromHome                      ***
## Education                                
## EnvironmentSatisfaction               ***
## GenderMale                            *  
## HourlyRate                            ***
## JobInvolvement                        ***
## JobLevel                                 
## JobSatisfaction                       ***
## MaritalStatusMarried                  ** 
## MaritalStatusSingle                   ***
## MonthlyIncome                            
## MonthlyRate                              
## NumCompaniesWorked                       
## OverTimeYes                           ***
## PercentSalaryHike                     ***
## PerformanceRating                     *  
## RelationshipSatisfaction              .  
## StockOptionLevel                      *  
## TotalWorkingYears                     *  
## TrainingTimesLastYear                 ***
## WorkLifeBalance                       ** 
## YearsAtCompany                        .  
## YearsInCurrentRole                    ***
## YearsSinceLastPromotion               ***
## YearsWithCurrManager                  .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7097.8  on 5119  degrees of freedom
## Residual deviance: 6056.2  on 5072  degrees of freedom
## AIC: 6152.2
## 
## Number of Fisher Scoring iterations: 13

Logistic model 1 on test set & confusion matrix

# Predicting attrition probabilities on the held-out testing set
testing_set_predictions <- predict(logistic_model, newdata = testing_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Converting probabilities to binary class predictions at the 0.5 threshold
predicted_classes <- ifelse(testing_set_predictions > 0.5, 1, 0)

# Create a confusion matrix. Building the prediction factor with the same
# levels as the reference avoids the level-mismatch error as.factor() would
# cause if the model ever predicted only one class. Note caret takes the
# first level ("0", no attrition) as the 'Positive' class, so the reported
# sensitivity/PPV describe detection of stayers, not leavers.
conf_mat <- confusionMatrix(factor(predicted_classes, levels = levels(testing_set$Attrition_numeric)),
                            testing_set$Attrition_numeric)

# Print the confusion matrix along with statistics
print(conf_mat)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3888  321
##          1 1961  776
##                                           
##                Accuracy : 0.6715          
##                  95% CI : (0.6603, 0.6825)
##     No Information Rate : 0.8421          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2315          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6647          
##             Specificity : 0.7074          
##          Pos Pred Value : 0.9237          
##          Neg Pred Value : 0.2835          
##              Prevalence : 0.8421          
##          Detection Rate : 0.5597          
##    Detection Prevalence : 0.6060          
##       Balanced Accuracy : 0.6861          
##                                           
##        'Positive' Class : 0               
## 

Interpretation for Logistic model 1

Reference: 0 represents the negative class (e.g., employees who do not leave the company). 1 represents the positive class (e.g., employees who leave the company).

Prediction: 0 represents the model’s prediction of the negative class. 1 represents the model’s prediction of the positive class.

Note that caret takes the first factor level, 0 (no attrition), as the ‘Positive’ class (see “‘Positive’ Class : 0” in the output), so the cells read as follows.

True Positives (TP): 3888 — employees who stayed and were correctly predicted to stay (positive class 0). This matches the reported Pos Pred Value 3888/(3888+321) = 0.9237 and Sensitivity 3888/(3888+1961) = 0.6647.

False Positives (FP): 321 — employees who actually left but were predicted to stay; from the attrition-detection viewpoint these are the missed leavers.

True Negatives (TN): 776 — employees who left and were correctly predicted to leave.

False Negatives (FN): 1961 — employees who stayed but were predicted to leave (the cost of the down-sampled, recall-oriented model).

Accuracy: 0.6715

Overall proportion of correct predictions made by the model.

Precision (Positive Predictive Value): 0.9237 Proportion of instances predicted as positive by the model that are actually positive.

Recall (Sensitivity): 0.6647 Proportion of actual positive instances that were correctly predicted by the model.

Specificity: 0.7074 Proportion of actual negative instances that were correctly predicted by the model.

F1 Score: Harmonic mean of precision and recall, providing a balance between the two metrics.

Kappa: 0.2315 A statistic that measures inter-rater agreement for categorical items.

Detection Rate: 0.5597 Proportion of all observations that are true positives for the positive class, i.e. TP divided by the total sample (3888/6946).

Balanced Accuracy: 0.6861 Average of sensitivity and specificity, useful for imbalanced datasets.

AUC for Logistic Model

# Calculate the area under the ROC curve for the test-set probabilities
# (pROC infers control = level "0", case = level "1", as the messages show)
roc_result <- roc(response = testing_set$Attrition_numeric, predictor = as.numeric(testing_set_predictions))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_result)

# Print AUC
print(auc_value)
## Area under the curve: 0.7416

AUC plot

# Calculate ROC curve
roc_result <- roc(response = testing_set$Attrition_numeric, predictor = as.numeric(testing_set_predictions))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC
auc_value <- auc(roc_result)

# Build a plotting data frame from the roc object. pROC stores sensitivities
# and specificities; the false positive rate is 1 - specificity, so convert
# once here instead of storing specificity under the misleading name "FPR"
# and undoing it in the plot.
roc_data <- data.frame(
  TPR = roc_result$sensitivities,        # True Positive Rate (Sensitivity)
  FPR = 1 - roc_result$specificities,    # False Positive Rate (1 - Specificity)
  thresholds = roc_result$thresholds
)


# Plot ROC curve using ggplot2
ggroc_plot <- ggplot(data = roc_data, aes(x = FPR, y = TPR)) +
  # `size` is deprecated for lines since ggplot2 3.4.0 — use `linewidth`
  geom_line(color = "blue", linewidth = 1.5) +
  geom_ribbon(aes(ymin = 0, ymax = TPR), fill = "blue", alpha = 0.2) +  # Adds shaded area under curve
  geom_abline(linetype = "dashed", color = "red") +  # Chance-level reference line
  ggtitle("ROC Curve with AUC") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),  # Centers the title
        axis.title = element_text(size = 12),    # Adjust the axis titles size
        axis.text = element_text(size = 12)) +   # Adjust the axis text size
  xlab("False Positive Rate (1 - Specificity)") +
  ylab("True Positive Rate (Sensitivity)") +
  annotate("text", x = .5, y = .2, label = paste("AUC =", round(auc_value, 2)), 
           size = 5, color = "black", hjust = 0.5)

# Print the plot
print(ggroc_plot)

Correlation plot after logistic model

# Numeric columns of the (down-sampled) training set
numeric_data <- training_set[, sapply(training_set, is.numeric)]

# Calculate the correlation matrix
cor_matrix <- cor(numeric_data)

# Ask caret which columns to drop so all pairwise |r| fall below 0.7;
# the returned integers index columns of numeric_data.
# NOTE(review): the flagged columns are never actually removed before the
# LASSO below — confirm whether that is intentional (LASSO can handle it).
highly_correlated_pairs <- findCorrelation(cor_matrix, cutoff = 0.7, verbose = TRUE)
## Compare row 20  and column  23 with corr  0.77 
##   Means:  0.208 vs 0.071 so flagging column 20 
## Compare row 17  and column  8 with corr  0.755 
##   Means:  0.175 vs 0.059 so flagging column 17 
## Compare row 8  and column  10 with corr  0.942 
##   Means:  0.136 vs 0.048 so flagging column 8 
## Compare row 23  and column  21 with corr  0.721 
##   Means:  0.116 vs 0.04 so flagging column 23 
## All correlations <= 0.7
# Print out highly correlated pairs
print(highly_correlated_pairs)
## [1] 20 17  8 23
# Visualize the correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45, tl.cex = 0.6, cl.cex = 0.6)

LASSO Regression for Variable significance

# Create a matrix of predictors and a response variable vector
x <- model.matrix(Attrition_numeric ~ . - 1, data = training_set)  # -1 to exclude intercept
y <- training_set$Attrition_numeric
set.seed(123)  # For reproducible results
# Cross-validate the LASSO penalty path for logistic regression
cv_model <- cv.glmnet(x, y, alpha = 1, family = "binomial")  # alpha = 1 for LASSO
best_lambda <- cv_model$lambda.min
# Refit at the CV-optimal lambda; coefficients shrunk to exactly zero
# (shown as '.' below) are excluded by the penalty
final_model <- glmnet(x, y, alpha = 1, lambda = best_lambda, family = "binomial")
# Print non-zero coefficients
print(coef(final_model))
## 51 x 1 sparse Matrix of class "dgCMatrix"
##                                                 s0
## (Intercept)                           -1.666317338
## DepartmentHuman Resources              .          
## DepartmentResearch & Development       .          
## DepartmentSales                        0.166045312
## EducationFieldHuman Resources         -0.144733398
## EducationFieldLife Sciences            0.006040185
## EducationFieldMarketing                .          
## EducationFieldMedical                  .          
## EducationFieldOther                   -0.070960331
## EducationFieldTechnical Degree         0.434409154
## JobRoleHuman Resources Director        0.575760029
## JobRoleHuman Resources Executive       .          
## JobRoleHuman Resources Manager        -3.278457019
## JobRoleHuman Resources Representative  0.027611163
## JobRoleLaboratory Technician           0.037992639
## JobRoleManufacturing Director         -0.078563562
## JobRoleResearch & Development Manager -0.111320037
## JobRoleResearch Director               .          
## JobRoleResearch Scientist             -0.060263435
## JobRoleSales Executive                 .          
## JobRoleSales Manager                   0.130773754
## JobRoleSales Representative            0.433661163
## Age                                   -0.380537835
## BusinessTravelTravel_Frequently        1.335675624
## BusinessTravelTravel_Rarely            0.660192684
## DailyRate                             -0.163884846
## DistanceFromHome                       0.191044681
## Education                             -0.003796077
## EnvironmentSatisfaction               -0.187248775
## GenderMale                             0.118086116
## HourlyRate                             0.107827665
## JobInvolvement                        -0.183731252
## JobLevel                              -0.057569006
## JobSatisfaction                       -0.122963324
## MaritalStatusMarried                   0.191036571
## MaritalStatusSingle                    0.581002400
## MonthlyIncome                         -0.123923688
## MonthlyRate                           -0.043174994
## NumCompaniesWorked                     0.022229926
## OverTimeYes                            0.864794478
## PercentSalaryHike                     -0.125541833
## PerformanceRating                      0.065587919
## RelationshipSatisfaction              -0.051891822
## StockOptionLevel                      -0.087882185
## TotalWorkingYears                      0.106687904
## TrainingTimesLastYear                 -0.119576273
## WorkLifeBalance                       -0.080160279
## YearsAtCompany                         0.077442271
## YearsInCurrentRole                    -0.269407729
## YearsSinceLastPromotion                0.188414486
## YearsWithCurrManager                  -0.071073557

Interpretation of coefficients (lasso)

(Intercept): The model intercept or bias term is approximately -1.666, which is the log-odds of the outcome when all predictor variables are zero.

Coefficients Shrunk to Zero (.): These are variables whose coefficients LASSO has shrunk exactly to zero, meaning they are dropped from the model and contribute nothing to its predictions. The dot (.) in the sparse-matrix printout is a placeholder indicating a coefficient of exactly zero after the penalty has been applied.

Positive Coefficients: Variables such as DepartmentSales (0.166) and BusinessTravelTravel_Frequently (1.336) have positive coefficients, indicating that higher values of these variables are associated with an increased log-odds of the outcome (possibly higher attrition, if Attrition_numeric = 1 represents attrition). For instance, employees in the sales department or those who travel frequently are more likely to experience the event of interest (which might be attrition in this context).

Negative Coefficients: Variables like JobRoleHuman Resources Manager (-3.278) have negative coefficients, indicating that higher values of these variables are associated with a decreased log-odds of the outcome. This might suggest that being in a human resources manager role is associated with a lower likelihood of attrition.

Large Positive Coefficients: Some coefficients like OverTimeYes (0.865) are substantially positive, which would suggest a strong association with the outcome. Employees who work overtime might have a much higher likelihood of attrition. Significant Predictors After Shrinkage: The LASSO method penalizes the absolute size of the coefficients and can drive some to zero, effectively selecting variables by removing those with negligible effects. The remaining non-zero coefficients are the variables LASSO has selected as having a more significant relationship with the outcome.

Removing variables that do not contribute significantly

# Drop predictors flagged as redundant by the correlation and LASSO analyses above.
# Define all variables to be removed in one list
variables_to_remove <- c("TotalWorkingYears", "YearsWithCurrManager", "YearsInCurrentRole",
  "MonthlyRate", "HourlyRate", "DailyRate", "JobLevel", "JobRole")

# Removing the specified variables from the dataset
# (logical column mask keeps every column whose name is NOT in the removal list)
ab_dataset_reduced <- ab_dataset_reduced[, !(names(ab_dataset_reduced) %in% variables_to_remove)]

# Confirm the removal by printing the remaining variable names
print(names(ab_dataset_reduced))
##  [1] "Department"               "EducationField"          
##  [3] "Age"                      "BusinessTravel"          
##  [5] "DistanceFromHome"         "Education"               
##  [7] "EnvironmentSatisfaction"  "Gender"                  
##  [9] "JobInvolvement"           "JobSatisfaction"         
## [11] "MaritalStatus"            "MonthlyIncome"           
## [13] "NumCompaniesWorked"       "OverTime"                
## [15] "PercentSalaryHike"        "PerformanceRating"       
## [17] "RelationshipSatisfaction" "StockOptionLevel"        
## [19] "TrainingTimesLastYear"    "WorkLifeBalance"         
## [21] "YearsAtCompany"           "YearsSinceLastPromotion" 
## [23] "Attrition_numeric"

Logistic Model 2 - with selected variables

set.seed(123) # for reproducibility

# Create a partition to split the dataset into training and testing sets
# createDataPartition samples within each outcome class, preserving the
# attrition ratio in both splits (70% train / 30% test).
split <- createDataPartition(ab_dataset_reduced$Attrition_numeric, p = 0.7, list = FALSE)

# Create the training and testing datasets
trainingdata_reduced_ab <- ab_dataset_reduced[split, ]
testingdata_reduced_ab <- ab_dataset_reduced[-split, ]



# Balance the Attrition_numeric class in the training set
set.seed(123) # for reproducibility

# upSample resamples the minority class with replacement until both classes
# are equal in size. NOTE: it returns the response column renamed to 'Class'.
trainingdata_balanced_ab <- upSample(x = trainingdata_reduced_ab[, -which(names(trainingdata_reduced_ab) == "Attrition_numeric")],
                                     y = trainingdata_reduced_ab$Attrition_numeric)

# Check the balance of the updated training set
table(trainingdata_balanced_ab$Class)
## 
##     0     1 
## 13650 13650

Fitting the logistic model 2

# Fit the logistic regression model on the balanced training set
# NOTE(review): this first fit is redundant — it is identical (up to the response
# column name) to the refit below and is never used afterwards.
reduced_logistic_model_ab <- glm(Class ~ ., data = trainingdata_balanced_ab, family = binomial())

# Rename 'Class' back to 'Attrition_numeric'
# (upSample() renamed the response column to 'Class'; restore the original name)
names(trainingdata_balanced_ab)[names(trainingdata_balanced_ab) == "Class"] <- "Attrition_numeric"

# Fit the logistic regression model using 'Attrition_numeric' as the dependent variable
reduced_logistic_model_ab_updated <- glm(Attrition_numeric ~ ., data = trainingdata_balanced_ab, family = binomial())

# Print out the summary of the updated logistic regression model
summary(reduced_logistic_model_ab_updated)
## 
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = trainingdata_balanced_ab)
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.809264   0.225487  -8.024 1.03e-15 ***
## DepartmentResearch & Development -0.012866   0.051673  -0.249  0.80337    
## DepartmentSales                   0.391511   0.059179   6.616 3.70e-11 ***
## EducationFieldHuman Resources     0.037908   0.252895   0.150  0.88085    
## EducationFieldLife Sciences       0.013449   0.213519   0.063  0.94978    
## EducationFieldMarketing           0.006694   0.219828   0.030  0.97571    
## EducationFieldMedical             0.082194   0.213063   0.386  0.69966    
## EducationFieldOther               0.119205   0.218913   0.545  0.58608    
## EducationFieldTechnical Degree    0.432195   0.217422   1.988  0.04683 *  
## Age                              -0.394975   0.013692 -28.847  < 2e-16 ***
## BusinessTravelTravel_Frequently   1.378974   0.056422  24.440  < 2e-16 ***
## BusinessTravelTravel_Rarely       0.729983   0.051303  14.229  < 2e-16 ***
## DistanceFromHome                  0.160582   0.014489  11.083  < 2e-16 ***
## Education                        -0.053962   0.013532  -3.988 6.67e-05 ***
## EnvironmentSatisfaction          -0.152135   0.013681 -11.120  < 2e-16 ***
## GenderMale                        0.163060   0.027498   5.930 3.03e-09 ***
## JobInvolvement                   -0.197086   0.013126 -15.015  < 2e-16 ***
## JobSatisfaction                  -0.159256   0.013305 -11.970  < 2e-16 ***
## MaritalStatusMarried              0.177879   0.037270   4.773 1.82e-06 ***
## MaritalStatusSingle               0.550131   0.046698  11.781  < 2e-16 ***
## MonthlyIncome                    -0.126016   0.017230  -7.314 2.60e-13 ***
## NumCompaniesWorked                0.063519   0.013849   4.587 4.51e-06 ***
## OverTimeYes                       0.912477   0.028540  31.972  < 2e-16 ***
## PercentSalaryHike                -0.105969   0.014667  -7.225 5.00e-13 ***
## PerformanceRating                 0.010503   0.014600   0.719  0.47190    
## RelationshipSatisfaction         -0.034949   0.013246  -2.639  0.00833 ** 
## StockOptionLevel                 -0.103958   0.017126  -6.070 1.28e-09 ***
## TrainingTimesLastYear            -0.139011   0.013765 -10.099  < 2e-16 ***
## WorkLifeBalance                  -0.093841   0.013063  -7.184 6.79e-13 ***
## YearsAtCompany                   -0.050584   0.019401  -2.607  0.00913 ** 
## YearsSinceLastPromotion           0.097107   0.017197   5.647 1.64e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37846  on 27299  degrees of freedom
## Residual deviance: 32963  on 27269  degrees of freedom
## AIC: 33025
## 
## Number of Fisher Scoring iterations: 4
# Calculate Odds Ratios
# (exponentiating logit coefficients converts log-odds into multiplicative odds ratios)
exp(coef(reduced_logistic_model_ab_updated))
##                      (Intercept) DepartmentResearch & Development 
##                        0.1637746                        0.9872168 
##                  DepartmentSales    EducationFieldHuman Resources 
##                        1.4792136                        1.0386355 
##      EducationFieldLife Sciences          EducationFieldMarketing 
##                        1.0135402                        1.0067160 
##            EducationFieldMedical              EducationFieldOther 
##                        1.0856669                        1.1266007 
##   EducationFieldTechnical Degree                              Age 
##                        1.5406359                        0.6736967 
##  BusinessTravelTravel_Frequently      BusinessTravelTravel_Rarely 
##                        3.9708247                        2.0750448 
##                 DistanceFromHome                        Education 
##                        1.1741939                        0.9474680 
##          EnvironmentSatisfaction                       GenderMale 
##                        0.8588725                        1.1771070 
##                   JobInvolvement                  JobSatisfaction 
##                        0.8211201                        0.8527779 
##             MaritalStatusMarried              MaritalStatusSingle 
##                        1.1946807                        1.7334793 
##                    MonthlyIncome               NumCompaniesWorked 
##                        0.8816012                        1.0655796 
##                      OverTimeYes                PercentSalaryHike 
##                        2.4904832                        0.8994529 
##                PerformanceRating         RelationshipSatisfaction 
##                        1.0105587                        0.9656546 
##                 StockOptionLevel            TrainingTimesLastYear 
##                        0.9012627                        0.8702184 
##                  WorkLifeBalance                   YearsAtCompany 
##                        0.9104273                        0.9506743 
##          YearsSinceLastPromotion 
##                        1.1019788

Odds ratio interpretation

(Intercept): The odds ratio for the intercept is 0.1637746. Since the intercept represents the log odds of the outcome when all predictors are held at zero, this value isn’t often used for interpretation in the context of the predictors.

DepartmentResearch & Development: The odds of the outcome (likely an event such as employee attrition, based on the context of the variable names) for employees in the Research & Development department are 0.9872168 times those of the baseline department (Human Resources, the reference level omitted from the dummy coding), holding all other variables constant. This value is very close to 1, indicating almost no change in odds compared to the baseline.

DepartmentSales: Employees in the Sales department have 1.4792136 times the odds of the outcome compared to the baseline department (Human Resources), holding all other variables constant. This suggests a 47.9% increase in the odds of the outcome occurring.

Age: For each one-unit increase in age (note that the numeric predictors are standardized here, so one unit is one standard deviation), the odds of the outcome decrease by a factor of 0.6736967 (a 32.6% decrease in the odds), holding other variables constant.

BusinessTravelTravel_Frequently: Employees who travel frequently have 3.9708247 times the odds of the outcome compared to those who do not travel as part of their job (Non-Travel, the reference category), all else being equal. This is a substantial increase in odds.

MonthlyIncome: For each one-unit increase in monthly income (the variable is standardized, so one unit corresponds to one standard deviation of income), the odds of the outcome are multiplied by 0.8816012, indicating a decrease in the odds of the outcome as monthly income increases.

OverTimeYes: Employees who work overtime have 2.4904832 times the odds of the outcome happening compared to those who do not work overtime, controlling for other factors.

MaritalStatusSingle: Employees who are single have 1.7334793 times the odds of the outcome compared to the baseline marital status (Divorced, the reference level — both Married and Single appear as coefficients, so Divorced is the omitted category).

# Make predictions on the testing set using the logistic regression model
# type = "response" returns predicted probabilities rather than log-odds.
testingdata_predictions_ab <- predict(reduced_logistic_model_ab_updated, newdata = testingdata_reduced_ab, type = "response")

# Convert predictions to a binary outcome based on a threshold (e.g., 0.5)
predicted_class_ab <- ifelse(testingdata_predictions_ab > 0.5, 1, 0)

# Actual values of Attrition_numeric from the testing set
actual_class_ab <- testingdata_reduced_ab$Attrition_numeric

# Confusion Matrix to see how well the model did on the testing set
# NOTE: caret treats the first factor level ("0" = no attrition) as the positive class.
confusionMatrix(data = as.factor(predicted_class_ab), reference = as.factor(actual_class_ab))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3886  347
##          1 1963  750
##                                           
##                Accuracy : 0.6674          
##                  95% CI : (0.6562, 0.6785)
##     No Information Rate : 0.8421          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2178          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6644          
##             Specificity : 0.6837          
##          Pos Pred Value : 0.9180          
##          Neg Pred Value : 0.2764          
##              Prevalence : 0.8421          
##          Detection Rate : 0.5595          
##    Detection Prevalence : 0.6094          
##       Balanced Accuracy : 0.6740          
##                                           
##        'Positive' Class : 0               
## 

ROC Curve for Logistic model 2

# Generate the ROC object from actual and predicted classes
roc_object <- roc(response = actual_class_ab, predictor = testingdata_predictions_ab)

# Build a plotting data frame from the ROC object.
# NOTE: pROC exposes specificities; the column named "False_Positive_Rate"
# below actually holds SPECIFICITY, and the plot maps 1 - that column to the
# x aesthetic, so the axis is numerically correct. The misleading column name
# is kept for backward compatibility with any later use of `roc_data`.
roc_data <- data.frame(
  True_Positive_Rate = roc_object$sensitivities,
  False_Positive_Rate = roc_object$specificities,
  Threshold = roc_object$thresholds
)

# Calculate the area under the curve
auc_value <- auc(roc_object)

# ROC curve with ggplot2.
# Fixes vs. the original chunk:
#  * geom_line uses `linewidth` (the `size` aesthetic for lines is deprecated
#    since ggplot2 3.4; this file loads 3.5.1).
#  * the diagonal reference line and the AUC label use annotate() instead of
#    geom_segment(aes(...))/geom_text(aes(...)) with constant values, which
#    would otherwise be redrawn once per row of roc_data (overplotting).
roc_curve_plot <- ggplot(roc_data, aes(x = 1 - False_Positive_Rate, y = True_Positive_Rate)) +
  geom_line(color = "#1c61b6", linewidth = 1) +
  annotate("segment", x = 0, y = 0, xend = 1, yend = 1, color = "red", linetype = "dotted") +
  annotate("text", x = 0.8, y = 0.2, label = paste("AUC =", round(auc_value, 3)),
           hjust = 0, vjust = 0, color = "red") +
  labs(title = "ROC Curve with AUC", x = "False Positive Rate (1 - Specificity)", y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 12, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", color = "white"),
        legend.position = "none")

# Print the ROC Curve plot
print(roc_curve_plot)

Department wise (Human Resources, Sales and Research and Development) Logistic Regressions

# Partition the reduced dataset into one data frame per department.
# `%in%` yields FALSE for NA, matching subset()'s NA-dropping behavior.
hr_subset <- ab_dataset_reduced[ab_dataset_reduced$Department %in% "Human Resources", ]
sales_subset <- ab_dataset_reduced[ab_dataset_reduced$Department %in% "Sales", ]
rd_subset <- ab_dataset_reduced[ab_dataset_reduced$Department %in% "Research & Development", ]

# Function to check factor levels in each subset.
# Given a data frame, returns a named list mapping each factor column to its
# number of levels; non-factor columns are excluded.
# Fixes vs. original:
#  * `drop = FALSE` keeps the selection a data frame even when exactly one
#    column is a factor (otherwise `[, mask]` collapses to a bare vector and
#    lapply() would iterate over its ELEMENTS instead of over columns).
#  * vapply() replaces sapply() so the column mask is always logical.
check_factor_levels <- function(data) {
  factor_cols <- data[, vapply(data, is.factor, logical(1)), drop = FALSE]
  lapply(factor_cols, function(x) length(levels(x)))
}

# Apply to each department
# Used to verify each subset still has more than one level per factor before
# modeling (per the note below, single-level factors "cause issues in modeling").
hr_levels <- check_factor_levels(hr_subset)
sales_levels <- check_factor_levels(sales_subset)
rd_levels <- check_factor_levels(rd_subset)

Sales Subset

# Subset data for the Sales department from main dataset
sales_subset <- subset(ab_dataset_reduced, Department == "Sales")

# Remove the 'Department' column since it's no longer needed after filtering for Sales
sales_subset$Department <- NULL

# Remove single-level factors which can cause issues in modeling
# NOTE(review): empty levels are not dropped after subsetting (no droplevels()),
# so this filter only removes factors that were single-level in the full data —
# confirm that is the intended behavior.
sales_subset <- sales_subset[, sapply(sales_subset, function(x) !is.factor(x) || length(levels(x)) > 1)]

# Split data into training and testing sets
# (simple random 75/25 split; unlike createDataPartition above, not stratified)
set.seed(123) # For reproducibility
index <- sample(1:nrow(sales_subset), round(0.75 * nrow(sales_subset)))
train_data <- sales_subset[index, ]
test_data_sales <- sales_subset[-index, ]

# Balancing the training dataset by downsampling the majority class
table_train <- table(train_data$Attrition_numeric)
min_class_size <- min(table_train)

# Subsetting each class to separate the majority and minority classes
train_data_0 <- train_data[train_data$Attrition_numeric == 0, ]
train_data_1 <- train_data[train_data$Attrition_numeric == 1, ]

# Sampling from the majority class to balance with the minority class
train_data_0_downsampled <- train_data_0[sample(1:nrow(train_data_0), min_class_size), ]
train_data_balanced <- rbind(train_data_0_downsampled, train_data_1)

# Shuffle rows to mix classes well, ensuring the model isn't biased towards the original order
set.seed(123)
train_data_balanced <- train_data_balanced[sample(nrow(train_data_balanced)), ]

# Fit logistic regression model
# tryCatch reports (rather than aborts on) any fitting error
tryCatch({
  sales_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
  print(summary(sales_model))
}, error = function(e) {
  print("Error fitting model:")
  print(e$message)
})
## 
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = train_data_balanced)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.910016   0.278052  -6.869 6.45e-12 ***
## EducationFieldMarketing          0.137648   0.126109   1.091 0.275055    
## EducationFieldOther              0.286175   0.269705   1.061 0.288659    
## Age                             -0.202272   0.059618  -3.393 0.000692 ***
## BusinessTravelTravel_Frequently  1.623971   0.245235   6.622 3.54e-11 ***
## BusinessTravelTravel_Rarely      0.790824   0.224385   3.524 0.000424 ***
## DistanceFromHome                 0.378346   0.058956   6.417 1.39e-10 ***
## Education                       -0.187577   0.056760  -3.305 0.000951 ***
## EnvironmentSatisfaction         -0.193451   0.059834  -3.233 0.001224 ** 
## GenderMale                       0.096305   0.115787   0.832 0.405551    
## JobInvolvement                  -0.292967   0.055331  -5.295 1.19e-07 ***
## JobSatisfaction                 -0.169771   0.055551  -3.056 0.002242 ** 
## MaritalStatusMarried            -0.178187   0.161806  -1.101 0.270795    
## MaritalStatusSingle              0.775332   0.202421   3.830 0.000128 ***
## MonthlyIncome                   -0.114129   0.083884  -1.361 0.173651    
## NumCompaniesWorked               0.091511   0.050069   1.828 0.067596 .  
## OverTimeYes                      1.125056   0.123449   9.114  < 2e-16 ***
## PercentSalaryHike               -0.280459   0.064031  -4.380 1.19e-05 ***
## PerformanceRating                0.002623   0.066168   0.040 0.968381    
## RelationshipSatisfaction        -0.109068   0.054675  -1.995 0.046060 *  
## StockOptionLevel                 0.136253   0.075686   1.800 0.071824 .  
## TrainingTimesLastYear           -0.021633   0.056995  -0.380 0.704272    
## WorkLifeBalance                 -0.127593   0.058292  -2.189 0.028608 *  
## YearsAtCompany                  -0.205858   0.084336  -2.441 0.014649 *  
## YearsSinceLastPromotion          0.370418   0.070870   5.227 1.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2298.5  on 1657  degrees of freedom
## Residual deviance: 1920.6  on 1633  degrees of freedom
## AIC: 1970.6
## 
## Number of Fisher Scoring iterations: 4
# Predict and evaluate with confusion matrix
pred_prob <- predict(sales_model, test_data_sales, type = "response")
# Classify with a 0.5 probability threshold
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
confusion_matrix <- table(Predicted = pred_class, Actual = test_data_sales$Attrition_numeric)
print(confusion_matrix)
##          Actual
## Predicted   0   1
##         0 791 106
##         1 306 185
# Calculate and print accuracy
# Accuracy = correctly classified (matrix diagonal) over all test cases
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.703170028818444"
# Check for NAs or unusual values in the test dataset
# (numeric predictors appear standardized: means near 0, unit-scale ranges)
summary(test_data_sales)
##           EducationField      Age                     BusinessTravel
##  0               :  0    Min.   :-2.07560   Non-Travel       :149   
##  Human Resources :  0    1st Qu.:-0.75885   Travel_Frequently:275   
##  Life Sciences   :894    Median :-0.21021   Travel_Rarely    :964   
##  Marketing       :415    Mean   :-0.04735                           
##  Medical         :  0    3rd Qu.: 0.55789                           
##  Other           : 79    Max.   : 2.53301                           
##  Technical Degree:  0                                               
##  DistanceFromHome     Education        EnvironmentSatisfaction    Gender   
##  Min.   :-1.01127   Min.   :-2.11567   Min.   :-1.57109        Female:587  
##  1st Qu.:-0.88779   1st Qu.:-1.08579   1st Qu.:-0.65761        Male  :801  
##  Median :-0.14691   Median :-0.05591   Median : 0.25586                    
##  Mean   : 0.05352   Mean   :-0.07149   Mean   : 0.02815                    
##  3rd Qu.: 0.59397   3rd Qu.: 0.97398   3rd Qu.: 1.16934                    
##  Max.   : 2.44617   Max.   : 2.00386   Max.   : 1.16934                    
##                                                                            
##  JobInvolvement     JobSatisfaction     MaritalStatus MonthlyIncome     
##  Min.   :-2.43834   Min.   :-1.56854   Divorced:282   Min.   :-1.15914  
##  1st Qu.:-1.02971   1st Qu.:-0.66049   Married :635   1st Qu.:-0.49243  
##  Median : 0.37892   Median : 0.24756   Single  :471   Median :-0.18585  
##  Mean   :-0.01484   Mean   : 0.02055                  Mean   : 0.04991  
##  3rd Qu.: 0.37892   3rd Qu.: 1.15561                  3rd Qu.: 0.45068  
##  Max.   : 1.78756   Max.   : 1.15561                  Max.   : 2.83429  
##                                                                         
##  NumCompaniesWorked OverTime  PercentSalaryHike  PerformanceRating 
##  Min.   :-1.0760    No :977   Min.   :-1.15182   Min.   :-0.42862  
##  1st Qu.:-0.6758    Yes:411   1st Qu.:-0.87854   1st Qu.:-0.42862  
##  Median : 0.5249              Median :-0.33197   Median :-0.42862  
##  Mean   : 0.4534              Mean   :-0.03408   Mean   :-0.03468  
##  3rd Qu.: 1.3253              3rd Qu.: 0.76116   3rd Qu.:-0.42862  
##  Max.   : 2.5260              Max.   : 2.67414   Max.   : 2.33296  
##                                                                    
##  RelationshipSatisfaction StockOptionLevel    TrainingTimesLastYear
##  Min.   :-1.58641         Min.   :-0.933639   Min.   :-2.16986     
##  1st Qu.:-0.66086         1st Qu.:-0.933639   1st Qu.:-0.61951     
##  Median : 0.26468         Median : 0.241517   Median : 0.15566     
##  Mean   :-0.01872         Mean   :-0.004013   Mean   : 0.03782     
##  3rd Qu.: 1.19023         3rd Qu.: 0.241517   3rd Qu.: 0.15566     
##  Max.   : 1.19023         Max.   : 2.591829   Max.   : 2.48119     
##                                                                    
##  WorkLifeBalance    YearsAtCompany     YearsSinceLastPromotion
##  Min.   :-2.49813   Min.   :-1.14335   Min.   :-0.67867       
##  1st Qu.:-1.08035   1st Qu.:-0.65445   1st Qu.:-0.67867       
##  Median : 0.33742   Median :-0.16555   Median :-0.36759       
##  Mean   : 0.08819   Mean   : 0.05835   Mean   : 0.03246       
##  3rd Qu.: 0.33742   3rd Qu.: 0.48632   3rd Qu.: 0.25456       
##  Max.   : 1.75520   Max.   : 4.88641   Max.   : 3.98749       
##                                                               
##  Attrition_numeric
##  0:1097           
##  1: 291           
##                   
##                   
##                   
##                   
## 
# Predict and create the confusion matrix using the caret package
# NOTE(review): recomputes the same predictions as the previous chunk; kept so
# this chunk is self-contained.
predictions <- predict(sales_model, test_data_sales, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# confusionMatrix() adds Kappa, sensitivity/specificity, etc.; the first factor
# level ("0" = no attrition) is treated as the positive class.
conf_matrix <- confusionMatrix(factor(predicted_classes), factor(test_data_sales$Attrition_numeric))

# Print the detailed confusion matrix and associated statistics
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 791 106
##          1 306 185
##                                           
##                Accuracy : 0.7032          
##                  95% CI : (0.6784, 0.7271)
##     No Information Rate : 0.7903          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2849          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7211          
##             Specificity : 0.6357          
##          Pos Pred Value : 0.8818          
##          Neg Pred Value : 0.3768          
##              Prevalence : 0.7903          
##          Detection Rate : 0.5699          
##    Detection Prevalence : 0.6463          
##       Balanced Accuracy : 0.6784          
##                                           
##        'Positive' Class : 0               
## 
## [1] "Number of rows in test dataset: 1388"
## [1] "Length of prediction vector: 1388"

## [1] "AUC: 0.744351198363548"

Research and Development subset

# Subset data for the Research & Development department from the correct dataset
rd_subset <- subset(ab_dataset_reduced, Department == "Research & Development")

# Remove the 'Department' column as it is not needed after the subset by department
rd_subset$Department <- NULL

# Remove single-level factors which can cause issues in modeling
# NOTE(review): empty levels are not dropped after subsetting (no droplevels()),
# so this filter only removes factors that were single-level in the full data.
rd_subset <- rd_subset[, sapply(rd_subset, function(x) !is.factor(x) || length(levels(x)) > 1)]

# Split data into training and testing sets
# (simple random 75/25 split; not stratified by the outcome)
set.seed(123)  # Set seed for reproducibility
index <- sample(1:nrow(rd_subset), round(0.75 * nrow(rd_subset)))
train_data <- rd_subset[index, ]
test_data_rd <- rd_subset[-index, ]

# Balancing the training dataset by downsampling the majority class
table_train <- table(train_data$Attrition_numeric)
min_class_size <- min(table_train)

# Subsetting each class to separate the majority and minority classes
train_data_0 <- train_data[train_data$Attrition_numeric == 0, ]
train_data_1 <- train_data[train_data$Attrition_numeric == 1, ]

# Sampling from the majority class to balance with the minority class
train_data_0_downsampled <- train_data_0[sample(1:nrow(train_data_0), min_class_size), ]
train_data_balanced <- rbind(train_data_0_downsampled, train_data_1)

# Shuffle rows to mix classes well, ensuring the model isn't biased towards the original order
set.seed(123)
train_data_balanced <- train_data_balanced[sample(nrow(train_data_balanced)), ]

# Fit logistic regression model
# tryCatch reports (rather than aborts on) any fitting error
tryCatch({
  rd_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
  print(summary(rd_model))
}, error = function(e) {
  print("Error fitting model:")
  print(e$message)
})
## 
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = train_data_balanced)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -2.709372   0.554885  -4.883 1.05e-06 ***
## EducationFieldLife Sciences      0.900856   0.535904   1.681 0.092762 .  
## EducationFieldMedical            0.909116   0.533733   1.703 0.088509 .  
## EducationFieldOther              0.651873   0.558317   1.168 0.242981    
## EducationFieldTechnical Degree   1.265953   0.543632   2.329 0.019875 *  
## Age                             -0.438291   0.038596 -11.356  < 2e-16 ***
## BusinessTravelTravel_Frequently  1.237428   0.168963   7.324 2.41e-13 ***
## BusinessTravelTravel_Rarely      0.711752   0.154845   4.597 4.30e-06 ***
## DistanceFromHome                 0.084496   0.043180   1.957 0.050369 .  
## Education                       -0.030262   0.039970  -0.757 0.448972    
## EnvironmentSatisfaction         -0.072961   0.039429  -1.850 0.064250 .  
## GenderMale                       0.290066   0.079730   3.638 0.000275 ***
## JobInvolvement                  -0.206738   0.038491  -5.371 7.83e-08 ***
## JobSatisfaction                 -0.218284   0.039038  -5.592 2.25e-08 ***
## MaritalStatusMarried             0.234966   0.110855   2.120 0.034041 *  
## MaritalStatusSingle              0.755564   0.139087   5.432 5.56e-08 ***
## MonthlyIncome                   -0.130092   0.046426  -2.802 0.005077 ** 
## NumCompaniesWorked              -0.003873   0.044468  -0.087 0.930600    
## OverTimeYes                      0.870294   0.082148  10.594  < 2e-16 ***
## PercentSalaryHike                0.004104   0.041492   0.099 0.921218    
## PerformanceRating                0.007081   0.041934   0.169 0.865909    
## RelationshipSatisfaction        -0.041873   0.038481  -1.088 0.276526    
## StockOptionLevel                -0.101840   0.050779  -2.006 0.044904 *  
## TrainingTimesLastYear           -0.177414   0.039166  -4.530 5.91e-06 ***
## WorkLifeBalance                 -0.078901   0.038435  -2.053 0.040087 *  
## YearsAtCompany                  -0.044004   0.055136  -0.798 0.424813    
## YearsSinceLastPromotion         -0.020841   0.051443  -0.405 0.685389    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4585.9  on 3307  degrees of freedom
## Residual deviance: 3972.3  on 3281  degrees of freedom
## AIC: 4026.3
## 
## Number of Fisher Scoring iterations: 3
# Score the held-out R&D observations and summarise performance with a
# confusion matrix (rows = predicted class, columns = observed class).
pred_prob <- predict(rd_model, test_data_rd, type = "response")
# Threshold the predicted probabilities at 0.5 to obtain hard class labels.
pred_class <- as.integer(pred_prob > 0.5)
confusion_matrix <- table(
  Predicted = pred_class,
  Actual = test_data_rd$Attrition_numeric
)
print(confusion_matrix)
##          Actual
## Predicted    0    1
##         0 2182  187
##         1 1100  382
# Overall accuracy = correctly classified cases / total cases.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.665801090625811"

Research and Development confusion matrix

# Re-score the R&D hold-out set and let caret::confusionMatrix() report the
# full set of classification statistics (kappa, sensitivity, specificity, ...).
predictions_rd <- predict(rd_model, test_data_rd, type = "response")
predicted_classes_rd <- as.integer(predictions_rd > 0.5)
conf_matrix_rd <- confusionMatrix(
  factor(predicted_classes_rd),
  factor(test_data_rd$Attrition_numeric)
)

# Print the detailed confusion matrix and associated statistics
print(conf_matrix_rd)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2182  187
##          1 1100  382
##                                           
##                Accuracy : 0.6658          
##                  95% CI : (0.6507, 0.6807)
##     No Information Rate : 0.8522          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2021          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6648          
##             Specificity : 0.6714          
##          Pos Pred Value : 0.9211          
##          Neg Pred Value : 0.2578          
##              Prevalence : 0.8522          
##          Detection Rate : 0.5666          
##    Detection Prevalence : 0.6152          
##       Balanced Accuracy : 0.6681          
##                                           
##        'Positive' Class : 0               
## 

R&D subset ROC curve

  # Predict attrition probabilities for the R&D test set; the tryCatch leaves
  # pred_prob NULL if predict() fails, instead of aborting the script.
  pred_prob <- NULL
  tryCatch({
    pred_prob <- predict(rd_model, newdata = test_data_rd, type = "response")
  }, error = function(e) {
    print("Error in prediction:")
    print(e$message)
  })

  # Only proceed if pred_prob has been successfully created and is aligned
  # one-to-one with the actual test responses.
  if (!is.null(pred_prob) && length(pred_prob) == length(test_data_rd$Attrition_numeric)) {
    library(pROC)

    # Generate ROC curve (pROC treats 0 as control and 1 as case; see the
    # "Setting levels/direction" messages emitted below)
    roc_curve <- roc(response = test_data_rd$Attrition_numeric, predictor = pred_prob)

    # Plot ROC curve
    plot(roc_curve, main = "ROC Curve for R&D Logistic Model")
    abline(a = 0, b = 1, col = "red", lty = 2)  # Diagonal chance-performance reference line

    # Calculate AUC (0.5 = chance, 1 = perfect discrimination)
    auc_value <- auc(roc_curve)
    print(paste("AUC:", auc_value))

    # Optional: Add AUC to the plot
    legend("bottomright", legend = paste("AUC =", round(auc_value, 4)), box.lty = 1, col = "black", bty = "n")
  } else {
    print("Error: Unable to generate ROC curve due to previous error or mismatch in data lengths.")
  }
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## [1] "AUC: 0.725022463691284"

Human Resources Subset

# Subset data for the Human Resources department from the updated dataset name
hr_subset <- subset(ab_dataset_reduced, Department == "Human Resources")

# Remove the 'Department' column - it is constant after subsetting
hr_subset$Department <- NULL

# Remove single-level factors, which break glm()'s contrast handling.
# vapply() replaces sapply() so the result is guaranteed to be logical.
hr_subset <- hr_subset[, vapply(hr_subset, function(x) !is.factor(x) || length(levels(x)) > 1, logical(1))]

# Split data into training (75%) and testing (25%) sets
set.seed(123)  # Set seed for reproducibility
index <- sample(seq_len(nrow(hr_subset)), round(0.75 * nrow(hr_subset)))
train_data <- hr_subset[index, ]
test_data_hr <- hr_subset[-index, ]

# Balancing the training dataset by downsampling the majority class
table_train <- table(train_data$Attrition_numeric)
min_class_size <- min(table_train)

# Subsetting each class to separate the majority and minority classes
train_data_0 <- train_data[train_data$Attrition_numeric == 0, ]
train_data_1 <- train_data[train_data$Attrition_numeric == 1, ]

# Sampling from the (assumed-majority) class 0 down to the minority size.
# NOTE(review): this assumes class 0 is the majority; if class 1 were the
# majority the data would remain imbalanced - confirm against the class table.
train_data_0_downsampled <- train_data_0[sample(seq_len(nrow(train_data_0)), min_class_size), ]
train_data_balanced <- rbind(train_data_0_downsampled, train_data_1)

# Shuffle rows to mix classes well, ensuring the model isn't biased towards the original order
set.seed(123)
train_data_balanced <- train_data_balanced[sample(nrow(train_data_balanced)), ]

# Fit the logistic regression model; tryCatch reports a fitting error
# (e.g. contrast failures) without stopping the rest of the script
tryCatch({
  hr_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
  print(summary(hr_model))
}, error = function(e) {
  print("Error fitting model:")
  print(e$message)
})
## 
## Call:
## glm(formula = Attrition_numeric ~ ., family = binomial(), data = train_data_balanced)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -0.66261    0.75415  -0.879 0.379609    
## EducationFieldLife Sciences     -0.73767    0.40108  -1.839 0.065880 .  
## EducationFieldMarketing         -0.72497    0.66125  -1.096 0.272916    
## EducationFieldMedical            0.34875    0.46547   0.749 0.453713    
## EducationFieldOther              0.44066    0.54978   0.802 0.422822    
## EducationFieldTechnical Degree  -0.11687    0.66201  -0.177 0.859876    
## Age                             -0.78064    0.14690  -5.314 1.07e-07 ***
## BusinessTravelTravel_Frequently  1.34156    0.64306   2.086 0.036960 *  
## BusinessTravelTravel_Rarely      1.01639    0.60731   1.674 0.094210 .  
## DistanceFromHome                 0.30631    0.14073   2.177 0.029508 *  
## Education                        0.47500    0.13697   3.468 0.000525 ***
## EnvironmentSatisfaction         -0.26257    0.11520  -2.279 0.022659 *  
## GenderMale                      -0.25698    0.26559  -0.968 0.333250    
## JobInvolvement                  -0.16369    0.13484  -1.214 0.224775    
## JobSatisfaction                  0.01171    0.12541   0.093 0.925614    
## MaritalStatusMarried            -0.48239    0.33553  -1.438 0.150522    
## MaritalStatusSingle             -0.52188    0.45337  -1.151 0.249687    
## MonthlyIncome                   -0.50460    0.17893  -2.820 0.004800 ** 
## NumCompaniesWorked              -0.20397    0.12112  -1.684 0.092183 .  
## OverTimeYes                      0.34287    0.26691   1.285 0.198930    
## PercentSalaryHike               -0.43075    0.14070  -3.062 0.002202 ** 
## PerformanceRating                0.24800    0.14072   1.762 0.078005 .  
## RelationshipSatisfaction        -0.20449    0.13423  -1.523 0.127656    
## StockOptionLevel                -0.25464    0.15249  -1.670 0.094933 .  
## TrainingTimesLastYear           -0.60331    0.15039  -4.012 6.03e-05 ***
## WorkLifeBalance                  0.07025    0.13359   0.526 0.598949    
## YearsAtCompany                   0.31387    0.16130   1.946 0.051673 .  
## YearsSinceLastPromotion          0.25176    0.16435   1.532 0.125557    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 634.92  on 457  degrees of freedom
## Residual deviance: 461.35  on 430  degrees of freedom
## AIC: 517.35
## 
## Number of Fisher Scoring iterations: 5
# Score the HR hold-out rows and summarise performance with a confusion
# matrix (rows = predicted class, columns = observed class).
pred_prob <- predict(hr_model, test_data_hr, type = "response")
# Hard 0/1 labels at the conventional 0.5 probability cut-off.
pred_class <- as.integer(pred_prob > 0.5)
confusion_matrix <- table(
  Predicted = pred_class,
  Actual = test_data_hr$Attrition_numeric
)
print(confusion_matrix)
##          Actual
## Predicted   0   1
##         0 325  26
##         1 139  59
# Overall accuracy = correctly classified cases / total cases.
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.699453551912568"
# Predict and create the confusion matrix using the caret package.
# NOTE(review): this recomputes the same predictions as the chunk above;
# caret::confusionMatrix() additionally reports kappa, sensitivity, etc.
predictions_hr <- predict(hr_model, test_data_hr, type = "response")
predicted_classes_hr <- ifelse(predictions_hr > 0.5, 1, 0)
conf_matrix_hr <- confusionMatrix(factor(predicted_classes_hr), factor(test_data_hr$Attrition_numeric))

# Print the detailed confusion matrix and associated statistics
print(conf_matrix_hr)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 325  26
##          1 139  59
##                                           
##                Accuracy : 0.6995          
##                  95% CI : (0.6592, 0.7376)
##     No Information Rate : 0.8452          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2557          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7004          
##             Specificity : 0.6941          
##          Pos Pred Value : 0.9259          
##          Neg Pred Value : 0.2980          
##              Prevalence : 0.8452          
##          Detection Rate : 0.5920          
##    Detection Prevalence : 0.6393          
##       Balanced Accuracy : 0.6973          
##                                           
##        'Positive' Class : 0               
## 
# Fit logistic regression model.
# NOTE(review): this refits the same model on the same balanced training
# data as above; glm() is deterministic here, so hr_model is unchanged.
tryCatch({
  hr_model <- glm(Attrition_numeric ~ ., data = train_data_balanced, family = binomial())
}, error = function(e) {
  print("Error in model fitting:")
  print(e$message)
})

# Make predictions (attrition probabilities) on the test dataset
tryCatch({
  pred_prob <- predict(hr_model, newdata = test_data_hr, type = "response")
}, error = function(e) {
  print("Error in prediction:")
  print(e$message)
})

# Sanity check before building a ROC curve: predictions and actual
# responses must have the same length
print(paste("Re-checked length of actual test responses:", length(test_data_hr$Attrition_numeric)))
## [1] "Re-checked length of actual test responses: 549"
print(paste("Re-checked length of predicted probabilities:", length(pred_prob)))
## [1] "Re-checked length of predicted probabilities: 549"

HR Subset ROC Curve

# Only build the ROC curve when predictions align one-to-one with the
# actual test responses (guards against a failed predict() upstream)
if (length(test_data_hr$Attrition_numeric) == length(pred_prob)) {
  library(pROC)

  # Generate ROC curve (pROC treats 0 as control and 1 as case)
  roc_curve <- roc(response = test_data_hr$Attrition_numeric, predictor = pred_prob)

  # Plot ROC curve
  plot(roc_curve, main = "ROC Curve for HR Department Logistic Regression Model")
  abline(a = 0, b = 1, col = "red")  # Diagonal reference line (chance performance)

  # Calculate AUC (0.5 = chance, 1 = perfect discrimination)
  auc_value <- auc(roc_curve)
  print(paste("AUC:", auc_value))
  
  # Optional: Add AUC to the plot
  legend("bottomright", legend = paste("AUC =", round(auc_value, 4)), box.lty = 1, col = "black", bty = "n")
} else {
  print("Length mismatch between test responses and predictions. Unable to generate ROC curve.")
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## [1] "AUC: 0.784888438133874"

Comparison of Model accuracies across subsets (HR, Sales, R&D)

# Print a short evaluation report (header, confusion matrix, accuracy) for
# one department-level logistic model.
#
# department: label printed in the report header.
# model:      fitted glm object (binomial family).
# test_data:  hold-out data frame containing an Attrition_numeric column.
print_model_summary <- function(department, model, test_data) {
  cat("\n----------------------------------------\n")
  cat("Department:", department, "\n")

  # Probabilities -> hard 0/1 labels at the usual 0.5 cut-off.
  probs <- predict(model, test_data, type = "response")
  labels <- as.integer(probs > 0.5)

  conf_tab <- table(Predicted = labels, Actual = test_data$Attrition_numeric)
  acc <- sum(diag(conf_tab)) / sum(conf_tab)

  cat("Confusion Matrix:\n")
  print(conf_tab)
  cat("Accuracy:", sprintf("%.2f%%", acc * 100), "\n")
}


# Print summaries: one per-department evaluation report for each subset
# model, produced by the print_model_summary() helper defined above
print_model_summary("Sales", sales_model, test_data_sales)
## 
## ----------------------------------------
## Department: Sales 
## Confusion Matrix:
##          Actual
## Predicted   0   1
##         0 791 106
##         1 306 185
## Accuracy: 70.32%
print_model_summary("Research & Development", rd_model, test_data_rd)
## 
## ----------------------------------------
## Department: Research & Development 
## Confusion Matrix:
##          Actual
## Predicted    0    1
##         0 2182  187
##         1 1100  382
## Accuracy: 66.58%
print_model_summary("Human Resources", hr_model, test_data_hr)
## 
## ----------------------------------------
## Department: Human Resources 
## Confusion Matrix:
##          Actual
## Predicted   0   1
##         0 325  26
##         1 139  59
## Accuracy: 69.95%

Propensity Score Matching

data <- ab_dataset_reduced  # NOTE(review): `data` is never used below - the matching runs on trainingdata_balanced_ab; confirm this alias is still needed
# Calculate propensity scores with a logistic model on all covariates
ps_model <- glm(Attrition_numeric ~ ., data = trainingdata_balanced_ab, family = "binomial")

# Perform nearest-neighbour matching on the propensity score.
# NOTE(review): matchit() has no `model` argument - it estimates its own
# distance model (logistic by default), so ps_model appears unused here;
# verify against the MatchIt documentation.
match_data <- matchit(Attrition_numeric ~ ., data = trainingdata_balanced_ab, method = "nearest", model = ps_model)

# Extract matched data
matched_data <- match.data(match_data)

set.seed(123)  # For reproducibility
split <- sample.split(matched_data$Attrition_numeric, SplitRatio = 0.7)
train_data <- subset(matched_data, split == TRUE)
test_data <- subset(matched_data, split == FALSE)
# Fit logistic regression on the balanced training data.
# NOTE(review): this fits on trainingdata_balanced_ab, not on the matched
# train_data created just above - confirm which dataset was intended.

reduced_model <- glm(Attrition_numeric ~ .,
                     data = trainingdata_balanced_ab, family = binomial())


# Calculate VIF to check for multicollinearity.
# NOTE(review): vif() comes from the car package, which is not attached by
# the visible library() calls - confirm it is loaded elsewhere.

vif(reduced_model)  # A VIF value greater than 5 is typically cause for concern
##                              GVIF Df GVIF^(1/(2*Df))
## Department               2.132924  2        1.208492
## EducationField           2.535214  6        1.080607
## Age                      1.082284  1        1.040329
## BusinessTravel           1.044822  2        1.011022
## DistanceFromHome         1.230991  1        1.109500
## Education                1.035204  1        1.017450
## EnvironmentSatisfaction  1.085291  1        1.041773
## Gender                   1.026220  1        1.013025
## JobInvolvement           1.014915  1        1.007430
## JobSatisfaction          1.021082  1        1.010486
## MaritalStatus            1.734611  2        1.147626
## MonthlyIncome            1.474362  1        1.214233
## NumCompaniesWorked       1.139318  1        1.067388
## OverTime                 1.034080  1        1.016897
## PercentSalaryHike        1.212110  1        1.100959
## PerformanceRating        1.191801  1        1.091696
## RelationshipSatisfaction 1.017034  1        1.008481
## StockOptionLevel         1.693762  1        1.301446
## TrainingTimesLastYear    1.027627  1        1.013719
## WorkLifeBalance          1.016379  1        1.008156
## YearsAtCompany           2.043510  1        1.429514
## YearsSinceLastPromotion  1.657952  1        1.287615
# Predict using the reduced model.
# NOTE(review): predictions are made on testingdata_reduced_ab, not the
# matched test_data built above - the PSM split appears unused downstream.
predicted_probabilities <- predict(reduced_model, newdata = testingdata_reduced_ab, type = "response")
predicted_classes <- ifelse(predicted_probabilities > 0.5, 1, 0)

# Generate a confusion matrix (with full caret statistics) to evaluate the model

confusionMatrix(as.factor(predicted_classes), as.factor(testingdata_reduced_ab$Attrition_numeric))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3886  347
##          1 1963  750
##                                           
##                Accuracy : 0.6674          
##                  95% CI : (0.6562, 0.6785)
##     No Information Rate : 0.8421          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2178          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6644          
##             Specificity : 0.6837          
##          Pos Pred Value : 0.9180          
##          Neg Pred Value : 0.2764          
##              Prevalence : 0.8421          
##          Detection Rate : 0.5595          
##    Detection Prevalence : 0.6094          
##       Balanced Accuracy : 0.6740          
##                                           
##        'Positive' Class : 0               
## 

Variable Interaction for Logistic regression

# Set seed for reproducibility
set.seed(123)

# Splitting the dataset into training and testing sets (70:30).
# NOTE(review): createDataPartition() on a numeric outcome stratifies on
# quantile groups; for a 0/1 numeric this still balances the split, but a
# factor outcome would be more conventional - confirm the column's type.
split <- createDataPartition(training_set$Attrition_numeric, p = 0.7, list = FALSE, times = 1)
trainingset_interaction <- training_set[split, ]
testset_interaction <- training_set[-split, ]

# Balancing the 'Attrition_numeric' class in the training set using upSample
balanceddata_interaction <- upSample(x = trainingset_interaction[, names(trainingset_interaction) != "Attrition_numeric"],
                                     y = trainingset_interaction$Attrition_numeric)

# upSample() returns the outcome in a column named `Class`; restore the
# original outcome name and drop the `Class` column
balanceddata_interaction$Attrition_numeric <- as.factor(balanceddata_interaction$Class)
balanceddata_interaction$Class <- NULL  # Clean up, just in case

# Fit logistic regression model on the balanced training set, including
# three interaction terms: MonthlyIncome x JobLevel,
# TotalWorkingYears x JobLevel and YearsInCurrentRole x YearsAtCompany
logistic_model_interaction <- glm(Attrition_numeric ~ Department + EducationField + Age + 
                                  BusinessTravel + DailyRate + DistanceFromHome + Education + 
                                  EnvironmentSatisfaction + Gender + HourlyRate + JobInvolvement + 
                                  JobSatisfaction + MaritalStatus + MonthlyRate + NumCompaniesWorked + 
                                  OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction + 
                                  StockOptionLevel + TrainingTimesLastYear + WorkLifeBalance + 
                                  YearsSinceLastPromotion + YearsWithCurrManager +
                                  MonthlyIncome * JobLevel + TotalWorkingYears * JobLevel + 
                                  YearsInCurrentRole * YearsAtCompany, 
                                  data = balanceddata_interaction, family = binomial())

# Print out the results of the logistic regression model
summary(logistic_model_interaction)
## 
## Call:
## glm(formula = Attrition_numeric ~ Department + EducationField + 
##     Age + BusinessTravel + DailyRate + DistanceFromHome + Education + 
##     EnvironmentSatisfaction + Gender + HourlyRate + JobInvolvement + 
##     JobSatisfaction + MaritalStatus + MonthlyRate + NumCompaniesWorked + 
##     OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction + 
##     StockOptionLevel + TrainingTimesLastYear + WorkLifeBalance + 
##     YearsSinceLastPromotion + YearsWithCurrManager + MonthlyIncome * 
##     JobLevel + TotalWorkingYears * JobLevel + YearsInCurrentRole * 
##     YearsAtCompany, family = binomial(), data = balanceddata_interaction)
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -2.4862341  0.6400046  -3.885 0.000102 ***
## DepartmentResearch & Development   0.0758041  0.1507507   0.503 0.615074    
## DepartmentSales                    0.5766002  0.1702553   3.387 0.000707 ***
## EducationFieldHuman Resources      0.7813142  0.7140475   1.094 0.273865    
## EducationFieldLife Sciences        0.5862909  0.6036344   0.971 0.331415    
## EducationFieldMarketing            0.3417749  0.6211637   0.550 0.582170    
## EducationFieldMedical              0.5814716  0.6022051   0.966 0.334259    
## EducationFieldOther                0.5755311  0.6188361   0.930 0.352360    
## EducationFieldTechnical Degree     1.0251029  0.6150993   1.667 0.095601 .  
## Age                               -0.4074443  0.0391830 -10.398  < 2e-16 ***
## BusinessTravelTravel_Frequently    1.4106405  0.1579973   8.928  < 2e-16 ***
## BusinessTravelTravel_Rarely        0.6633699  0.1426172   4.651 3.30e-06 ***
## DailyRate                         -0.1841439  0.0377842  -4.874 1.10e-06 ***
## DistanceFromHome                   0.1252703  0.0415807   3.013 0.002589 ** 
## Education                          0.0008085  0.0378953   0.021 0.982978    
## EnvironmentSatisfaction           -0.1984853  0.0415693  -4.775 1.80e-06 ***
## GenderMale                         0.1278592  0.0779162   1.641 0.100801    
## HourlyRate                         0.1461968  0.0423125   3.455 0.000550 ***
## JobInvolvement                    -0.1804064  0.0361011  -4.997 5.82e-07 ***
## JobSatisfaction                   -0.1237468  0.0369841  -3.346 0.000820 ***
## MaritalStatusMarried               0.1937180  0.1055471   1.835 0.066451 .  
## MaritalStatusSingle                0.5668393  0.1322220   4.287 1.81e-05 ***
## MonthlyRate                       -0.0635002  0.0382853  -1.659 0.097195 .  
## NumCompaniesWorked                 0.0053192  0.0397640   0.134 0.893584    
## OverTimeYes                        0.9370381  0.0807609  11.603  < 2e-16 ***
## PercentSalaryHike                 -0.1197368  0.0420129  -2.850 0.004372 ** 
## PerformanceRating                  0.0826448  0.0429547   1.924 0.054355 .  
## RelationshipSatisfaction          -0.0625610  0.0373027  -1.677 0.093519 .  
## StockOptionLevel                  -0.0927660  0.0483766  -1.918 0.055164 .  
## TrainingTimesLastYear             -0.1425336  0.0391973  -3.636 0.000277 ***
## WorkLifeBalance                   -0.0695363  0.0367658  -1.891 0.058580 .  
## YearsSinceLastPromotion            0.1747860  0.0479935   3.642 0.000271 ***
## YearsWithCurrManager              -0.0831406  0.0700081  -1.188 0.234996    
## MonthlyIncome                     -0.1216715  0.1313998  -0.926 0.354465    
## JobLevel                          -0.1515737  0.1206870  -1.256 0.209144    
## TotalWorkingYears                  0.1712470  0.0700922   2.443 0.014559 *  
## YearsInCurrentRole                -0.3204637  0.0672382  -4.766 1.88e-06 ***
## YearsAtCompany                     0.0199024  0.0834474   0.239 0.811491    
## MonthlyIncome:JobLevel             0.0420247  0.0589718   0.713 0.476079    
## JobLevel:TotalWorkingYears        -0.0247818  0.0609996  -0.406 0.684550    
## YearsInCurrentRole:YearsAtCompany  0.0640496  0.0380298   1.684 0.092144 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4968.5  on 3583  degrees of freedom
## Residual deviance: 4229.8  on 3543  degrees of freedom
## AIC: 4311.8
## 
## Number of Fisher Scoring iterations: 3
# Predicting on the test set
predictions <- predict(logistic_model_interaction, newdata = testset_interaction, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

# Evaluate model performance.
# FIX: store the table as conf_mat_interaction instead of `confusionMatrix`,
# so the table does not shadow caret's confusionMatrix() function, which is
# called later in the script.
conf_mat_interaction <- table(Predicted = predicted_classes, Actual = testset_interaction$Attrition_numeric)
accuracy <- sum(diag(conf_mat_interaction)) / sum(conf_mat_interaction)

# Print the confusion matrix and accuracy
print(conf_mat_interaction)
##          Actual
## Predicted   0   1
##         0 502 236
##         1 266 532
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.673177083333333"

Interpretation of Coefficients based on the interaction Logistic model -

OverTimeYes (Coefficient: 0.9370, p-value: < 2e-16) Most significant predictor of attrition. Employees working overtime are more likely to leave. Age (Coefficient: -0.4074, p-value: < 2e-16) Significant negative predictor. Older employees are less likely to leave. BusinessTravelTravel_Frequently (Coefficient: 1.4106, p-value: < 2e-16) Frequent business travelers are more likely to leave. DailyRate (Coefficient: -0.1841, p-value: 1.10e-06) Higher daily rates are associated with lower attrition. EnvironmentSatisfaction (Coefficient: -0.1985, p-value: 1.80e-06) Higher satisfaction leads to lower attrition. BusinessTravelTravel_Rarely (Coefficient: 0.6634, p-value: 3.30e-06) Less frequent travel still predicts higher attrition compared to no travel. YearsInCurrentRole (Coefficient: -0.3205, p-value: 1.88e-06) Longer time in current role associates with lower attrition. JobInvolvement (Coefficient: -0.1804, p-value: 5.82e-07) Higher job involvement is linked to lower attrition. JobSatisfaction (Coefficient: -0.1237, p-value: 0.000820) Higher job satisfaction reduces attrition. DistanceFromHome (Coefficient: 0.1253, p-value: 0.002589) Longer commutes increase the likelihood of leaving.

Confusion Matrix for Logistic model with Variable interaction

# Convert predicted classes and actual values to factors so both inputs to
# caret::confusionMatrix() share the same 0/1 level set
predicted_factors <- factor(predicted_classes, levels = c(0, 1))
actual_factors <- factor(testset_interaction$Attrition_numeric, levels = c(0, 1))

evaluation_results <- confusionMatrix(predicted_factors, actual_factors)

# Print the evaluation results which include confusion matrix, overall statistics, and class statistics
print(evaluation_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 502 236
##          1 266 532
##                                           
##                Accuracy : 0.6732          
##                  95% CI : (0.6491, 0.6966)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3464          
##                                           
##  Mcnemar's Test P-Value : 0.1956          
##                                           
##             Sensitivity : 0.6536          
##             Specificity : 0.6927          
##          Pos Pred Value : 0.6802          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3268          
##    Detection Prevalence : 0.4805          
##       Balanced Accuracy : 0.6732          
##                                           
##        'Positive' Class : 0               
## 
library(ggplot2)
library(dplyr)

# Turn the fitted interaction model's coefficient table into a tidy data
# frame so the estimates can be plotted as bar charts.
model_summary <- summary(logistic_model_interaction)
coefficients <- model_summary$coefficients

data <- as.data.frame(coefficients)
data$Variable <- rownames(coefficients)
rownames(data) <- NULL  # row names are redundant once Variable exists

# Label each term by the sign of its estimate
data$Direction <- ifelse(data$Estimate > 0, "Positive", "Negative")

# Drop the intercept and split the terms by sign, ordering each group by
# effect size so the plots read largest-to-smallest
positive_coeffs <- data %>%
  filter(Estimate > 0, Variable != "(Intercept)") %>%
  arrange(desc(Estimate))

negative_coeffs <- data %>%
  filter(Estimate < 0, Variable != "(Intercept)") %>%
  arrange(Estimate)

# Fill colours for the two coefficient directions
positive_color <- "#EF553B"
negative_color <- "#00CC96"

# Apply the shared minimal bar-chart look to a ggplot object `p`:
# larger text, no y-axis ticks or horizontal gridlines, and no legend.
clean_theme <- function(p) {
  styled <- p +
    theme_minimal() +
    theme(
      text = element_text(size = 16),
      axis.title.y = element_text(size = 18),
      axis.text.y = element_text(size = 12),
      axis.ticks.y = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.y = element_blank(),
      legend.position = "none"
    )
  styled
}

# Overlay each coefficient's numeric estimate inside its bar.
#
# p:    ggplot bar chart built from `data` (bars mapped to Variable/Estimate).
# data: coefficient data frame with Variable, Estimate and Direction columns.
#
# FIX: map x to `Variable` directly instead of re-running reorder() here.
# The bar layer already fixes the ordering of the discrete axis; re-ordering
# in this layer with a different key (the negative plot sorts by -Estimate)
# can attach labels to the wrong bars. Mapping by value always matches the
# bar layer's axis.
add_coefficient_labels <- function(p, data) {
  p + geom_text(
    data = data,
    aes(
      label = sprintf("%.2f", Estimate),
      x = Variable,
      # Nudge the label slightly inward from the bar end, toward zero
      y = ifelse(Direction == "Positive",
                 Estimate - 0.05 * max(data$Estimate),
                 Estimate + 0.05 * abs(min(data$Estimate)))
    ),
    size = 3,
    hjust = ifelse(data$Direction == "Positive", 1.1, -0.1),
    color = "white"
  )
}

# Build, style and display the two coefficient bar charts.

# Positive coefficients: terms that increase the odds of attrition
p1 <- ggplot(positive_coeffs,
             aes(x = reorder(Variable, Estimate), y = Estimate, fill = Direction)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = positive_color) +
  labs(title = "Positive Coefficients for Attrition")

# Negative coefficients: terms that decrease the odds of attrition
p2 <- ggplot(negative_coeffs,
             aes(x = reorder(Variable, -Estimate), y = Estimate, fill = Direction)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = negative_color) +
  labs(title = "Negative Coefficients for Attrition")

# Apply the shared theme first, then the in-bar labels (same order as before)
p1 <- add_coefficient_labels(clean_theme(p1), positive_coeffs)
p2 <- add_coefficient_labels(clean_theme(p2), negative_coeffs)

# Render both plots
print(p1)

print(p2)

# Convert predicted_classes and actual values to factors to ensure compatibility with confusionMatrix function.
# NOTE(review): this chunk duplicates the evaluation performed a few chunks
# earlier (same inputs, same statistics) - consider removing one copy.
predicted_factors <- factor(predicted_classes, levels = c(0, 1))
actual_factors <- factor(testset_interaction$Attrition_numeric, levels = c(0, 1))

# caret is already attached at the top of the document; this call is a no-op
library(caret)

evaluation_results <- confusionMatrix(predicted_factors, actual_factors)

# Print the evaluation results which include confusion matrix, overall statistics, and class statistics
print(evaluation_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 502 236
##          1 266 532
##                                           
##                Accuracy : 0.6732          
##                  95% CI : (0.6491, 0.6966)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3464          
##                                           
##  Mcnemar's Test P-Value : 0.1956          
##                                           
##             Sensitivity : 0.6536          
##             Specificity : 0.6927          
##          Pos Pred Value : 0.6802          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3268          
##    Detection Prevalence : 0.4805          
##       Balanced Accuracy : 0.6732          
##                                           
##        'Positive' Class : 0               
## 

Decision Tree Model with Selected Variables (without JobRole, DailyRate, MonthlyRate, etc.)

# Building the decision tree (CART) model on the balanced training data,
# predicting the attrition class from all remaining columns.
# NOTE(review): rpart is not attached by the visible library() calls at the
# top of the document - confirm library(rpart) is loaded elsewhere.
decision_tree_model <- rpart(Attrition_numeric ~ .,
                             data = trainingdata_balanced_ab, 
                             method = "class")

Decision Tree predictions

# Predicting hard class labels with the decision tree model
tree_predictions <- predict(decision_tree_model, newdata = testingdata_reduced_ab, type = "class")

# Convert predictions to factor to ensure compatibility with confusionMatrix function.
# The levels must match those in the actual outcome variable.
# NOTE(review): levels() returns NULL unless Attrition_numeric is already a
# factor in testingdata_reduced_ab - confirm its type.
tree_predictions_factor <- factor(tree_predictions, levels = levels(testingdata_reduced_ab$Attrition_numeric))

# Evaluation using confusion matrix
tree_evaluation_results <- confusionMatrix(tree_predictions_factor, testingdata_reduced_ab$Attrition_numeric)

# Print the evaluation results
print(tree_evaluation_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4120  390
##          1 1729  707
##                                          
##                Accuracy : 0.6949         
##                  95% CI : (0.684, 0.7057)
##     No Information Rate : 0.8421         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2332         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.7044         
##             Specificity : 0.6445         
##          Pos Pred Value : 0.9135         
##          Neg Pred Value : 0.2902         
##              Prevalence : 0.8421         
##          Detection Rate : 0.5931         
##    Detection Prevalence : 0.6493         
##       Balanced Accuracy : 0.6744         
##                                          
##        'Positive' Class : 0              
## 

Cross Validation for Decision Trees

# Set up cross-validation settings
# NOTE(review): this 5-fold control object is never used — it is overwritten
# by the 10-fold trainControl() defined just below before any model is trained
# with it, so this chunk is effectively dead code.
train_control <- trainControl(
  method = "cv",          # Use k-fold cross-validation
  number = 5,            # Number of folds in k-fold cross-validation
  savePredictions = "final",
  classProbs = TRUE       # Store class probabilities (useful for ROC, AUC)
)
# Check current levels of the factor
levels(trainingdata_balanced_ab$Attrition_numeric)
## [1] "0" "1"
# Adjust the levels to be valid R variable names
# caret requires syntactically valid factor level names when classProbs = TRUE
# ("0"/"1" are not valid R names, so they become "X0"/"X1").
# This mutates the training data in place and affects all later chunks.
levels(trainingdata_balanced_ab$Attrition_numeric) <- make.names(levels(trainingdata_balanced_ab$Attrition_numeric))

# Verify the changes
levels(trainingdata_balanced_ab$Attrition_numeric)
## [1] "X0" "X1"

10 Fold Cross validation

# Set up cross-validation settings
train_control <- trainControl(
  method = "cv",          # Use k-fold cross-validation
  number = 10,            # Number of folds in k-fold cross-validation
  savePredictions = "final",
  classProbs = TRUE       # Store class probabilities
)

# Train the model with cross-validation
# caret tunes the rpart complexity parameter (cp) over a small default grid
# and keeps the value with the highest cross-validated accuracy.
set.seed(123)  # For reproducibility
decision_tree_model_cv <- train(
  Attrition_numeric ~ .,   # Formula
  data = trainingdata_balanced_ab,  # Training data
  method = "rpart",        # Training method (decision tree)
  trControl = train_control,  # Control object
  metric = "Accuracy"      # Performance metric
)

# Print the model summary
print(decision_tree_model_cv)
## CART 
## 
## 27300 samples
##    22 predictor
##     2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 24570, 24570, 24570, 24570, 24570, 24570, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.01978022  0.6584982  0.3169963
##   0.02849817  0.6279121  0.2558242
##   0.23626374  0.5683883  0.1367766
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01978022.
# Extract predictions and observed values from the model
# $pred holds the pooled out-of-fold predictions for the final cp value
# (savePredictions = "final"), so the confusion matrix below is built from
# held-out predictions, not resubstitution.
predictions <- decision_tree_model_cv$pred

# Select the final predicted classes 
final_predictions <- predictions$pred

# The observed values
observed_values <- predictions$obs

# Generate the confusion matrix
conf_matrix <- confusionMatrix(data = factor(final_predictions, levels = c("X0", "X1")),
                               reference = factor(observed_values, levels = c("X0", "X1")))

# Print the confusion matrix
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 9504 5177
##         X1 4146 8473
##                                           
##                Accuracy : 0.6585          
##                  95% CI : (0.6528, 0.6641)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.317           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6963          
##             Specificity : 0.6207          
##          Pos Pred Value : 0.6474          
##          Neg Pred Value : 0.6714          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3481          
##    Detection Prevalence : 0.5378          
##       Balanced Accuracy : 0.6585          
##                                           
##        'Positive' Class : X0              
## 
# Extract results
results <- decision_tree_model_cv$results

# Visualise how cross-validated accuracy varies with the cp tuning value;
# each point is one cp value from the tuning grid.
results %>%
  ggplot(aes(x = cp, y = Accuracy)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Accuracy vs Complexity Parameter (cp)",
    x = "Complexity Parameter (cp)",
    y = "Accuracy"
  ) +
  theme_minimal()

Decision Tree Cross validated optimal CP value

# Refit the classification tree directly with rpart at the cp value selected
# by the 10-fold cross-validation above (cp = 0.01978022), so the tuned tree
# can be plotted with rpart.plot below.
decision_tree <- rpart(
  Attrition_numeric ~ .,
  data = trainingdata_balanced_ab,
  method = "class",
  control = rpart.control(cp = 0.01978022)
)

# If you wish to train using the entire dataset and visualize, otherwise skip this
# and use the model from the train() function for accuracy results.

Decision Tree Cross validated Plot

# Visualise the CV-tuned tree. extra = 104 prints the predicted class,
# per-class probabilities and the percentage of observations in each node.
rpart.plot(decision_tree,
           main   = "Decision Tree Plot",
           extra  = 104,   # class, class probabilities and node percentage
           under  = TRUE,  # print the node figures beneath each box
           faclen = 0)     # 0 = show full (unabbreviated) factor level names

Decision Tree plot with all variables

# Plotting the decision tree
# Default rpart.plot rendering of the first (untuned) selected-variable model.
rpart.plot(decision_tree_model)

Decision Tree Variable Importance

# rpart's accumulated importance scores: only variables that appear in
# primary or surrogate splits get a score, so not all 22 predictors show up.
variable_importance <- decision_tree_model$variable.importance

# Order variables by importance in descending order
ordered_importance <- sort(variable_importance, decreasing = TRUE)

# Print the ordered variable importance
ordered_importance
##                     Age                OverTime        StockOptionLevel 
##             785.6839521             533.4903002             369.9393083 
##           MaritalStatus           MonthlyIncome          BusinessTravel 
##             276.8573304             206.9696411             109.6644593 
##               Education          YearsAtCompany       PercentSalaryHike 
##              65.3445040              64.4883620              18.5971663 
##   TrainingTimesLastYear      NumCompaniesWorked EnvironmentSatisfaction 
##              16.7410552              15.4888790              13.1240286 
##          EducationField YearsSinceLastPromotion 
##               4.1421277               0.3944884

Variable Importance plot

# Provided variable importance scores
# NOTE(review): this hand-transcribed vector omits MonthlyIncome (206.97),
# which IS present in the model output printed above — confirm whether the
# omission is intentional before relying on this plot.
variable_importance <- c(
  Age = 785.6839521, OverTime = 533.4903002, StockOptionLevel = 369.9393083, 
  MaritalStatus = 276.8573304, BusinessTravel = 109.6644593, Education = 65.3445040, 
  YearsAtCompany = 64.4883620, PercentSalaryHike = 18.5971663, TrainingTimesLastYear = 16.7410552, 
  NumCompaniesWorked = 15.4888790, EnvironmentSatisfaction = 13.1240286, EducationField = 4.1421277, 
  YearsSinceLastPromotion = 0.3944884
)

# Sort the importance scores in descending order
sorted_importance <- sort(variable_importance, decreasing = TRUE)

# Print sorted variable importance
print(sorted_importance)
##                     Age                OverTime        StockOptionLevel 
##             785.6839521             533.4903002             369.9393083 
##           MaritalStatus          BusinessTravel               Education 
##             276.8573304             109.6644593              65.3445040 
##          YearsAtCompany       PercentSalaryHike   TrainingTimesLastYear 
##              64.4883620              18.5971663              16.7410552 
##      NumCompaniesWorked EnvironmentSatisfaction          EducationField 
##              15.4888790              13.1240286               4.1421277 
## YearsSinceLastPromotion 
##               0.3944884
# Create a data frame for plotting (Variable/Importance columns for ggplot)
importance_df <- data.frame(Variable = names(sorted_importance), Importance = sorted_importance)

# Horizontal bar chart of the importance scores, most important at the top;
# bars are colour-graded blue (low) to red (high) and the redundant fill
# legend is suppressed.
ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance, fill = Importance)) +
  geom_col() +
  scale_fill_gradient(low = "blue", high = "red") +
  coord_flip() +
  labs(
    title = "Variable Importance from Model",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

#Full Decision Tree model with all predictors

# Complexity parameter for the full-predictor tree; rpart forwards cp
# through ... to rpart.control().
cp_value <- 0.01  # Adjust this as necessary

# Rebuild the decision tree model with the new complexity parameter
# training_set here is the full-predictor split (not the reduced/balanced one).
tree_model <- rpart(Attrition_numeric ~ ., data = training_set, method = "class", cp = cp_value)

# Plot the decision tree with adjusted parameters
rpart.plot(tree_model, main="Decision Tree for Attrition Prediction",
           box.palette="RdYlGn",       # Use a red-yellow-green color palette
           shadow.col="gray",          # Add shadow for a 3D effect
           cex=0.65,                    # Adjust text size as needed
           tweak= 1,                  # Adjust the overall size of plot elements
           fallen.leaves=FALSE,         # FALSE: leaves stay at their natural depth (TRUE would align them at the bottom)
           type=4,                     # Use enhanced text and splits
           extra=104)                  # Display extra node information
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Evaluate the model on the testing set
predictions <- predict(tree_model, newdata = testing_set, type = "class")
confusionMatrix(predictions, testing_set$Attrition_numeric)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4015  365
##          1 1834  732
##                                           
##                Accuracy : 0.6834          
##                  95% CI : (0.6723, 0.6943)
##     No Information Rate : 0.8421          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2291          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6864          
##             Specificity : 0.6673          
##          Pos Pred Value : 0.9167          
##          Neg Pred Value : 0.2853          
##              Prevalence : 0.8421          
##          Detection Rate : 0.5780          
##    Detection Prevalence : 0.6306          
##       Balanced Accuracy : 0.6769          
##                                           
##        'Positive' Class : 0               
## 
# Extract variable importance from the rpart model
var_importance <- as.data.frame(tree_model$variable.importance)

# Give the single importance column a readable name
names(var_importance) <- c("Importance")

# Sort the variables by importance (descending); drop = FALSE keeps the
# one-column data frame (and its rownames) instead of collapsing to a vector.
var_importance_sorted <- var_importance[order(-var_importance$Importance), , drop = FALSE]

# Create a barplot.
# BUG FIX: barplot() was given a bare numeric vector, which carries no names,
# so no variable labels were drawn and las = 2 / cex.names had no effect.
# Passing names.arg = rownames(...) restores the labels.
barplot(var_importance_sorted$Importance,
        names.arg = rownames(var_importance_sorted),
        main = "Variable Importance",
        xlab = "Variables", ylab = "Importance", las = 2, cex.names = 0.7)

# If you have a lot of variables and the names are cluttered, you can increase the margin size
par(mar = c(12, 5, 4, 2) + 0.1)  # Adjust the bottom margin to fit variable names
barplot(var_importance_sorted$Importance,
        names.arg = rownames(var_importance_sorted),
        main = "Variable Importance",
        xlab = "Variables", ylab = "Importance", las = 2, cex.names = 0.7)

Variable importance for DT model 2 (With full predictors)

# Rebuild as a tidy two-column data frame (Variable, Importance) for ggplot.
var_importance_sorted <- data.frame(
  Variable = rownames(var_importance_sorted),
  Importance = var_importance_sorted$Importance
)

# Horizontal bar chart, most important variable at the top.
# BUG FIX: the axis labels were swapped — x (the variable axis after
# coord_flip) was labelled "Importance" and then blanked via theme(),
# leaving the value axis with an empty label. Now the value axis is
# labelled "Importance" and the variable axis needs no title.
ggplot(var_importance_sorted, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() + # Flip coordinates to make bars horizontal
  labs(title = "Variable Importance",
       x = NULL,          # variable names label themselves on the flipped axis
       y = "Importance") + # value axis, correctly labelled
  theme_minimal() +
  theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 8), # Adjust the text size as needed
        plot.title = element_text(hjust = 0.5)) # Center the plot title

Splitting the data into subsets for Sales, Human Resources, and Research & Development for decision trees

# Create subsets for each department
# NOTE(review): sales_data / hr_data / rnd_data are later overwritten by
# hand-built importance data frames (around L3133+); these subsets are only
# valid until then.
sales_data <- subset(ab_dataset_reduced, Department == "Sales")
hr_data <- subset(ab_dataset_reduced, Department == "Human Resources")
rnd_data <- subset(ab_dataset_reduced, Department == "Research & Development")

# Function to process a department dataset
# Fits and evaluates a per-department attrition decision tree:
#   1. 70/30 stratified split on Attrition_numeric (createDataPartition).
#   2. Down-samples the majority class in the training portion only, so the
#      tree is trained on balanced classes.
#   3. Fits an rpart classification tree with default settings.
#   4. Evaluates on the untouched (still imbalanced) test portion.
# Returns: list(model = fitted rpart object, evaluation = caret confusionMatrix).
# The internal set.seed() makes repeated calls on the same data deterministic.
process_department_data <- function(data) {
  set.seed(123) # for reproducibility
  
  # Split data into training and testing
  index <- createDataPartition(y = data$Attrition_numeric, p = 0.7, list = FALSE)
  training_set <- data[index, ]
  testing_set <- data[-index, ]
  
  # Balance attrition in the training set
  # caret::downSample() returns the outcome in a column named "Class".
  downsampled_data <- downSample(x = training_set[, -which(names(training_set) == "Attrition_numeric")],
                                 y = training_set$Attrition_numeric)
  
  # Fix the outcome variable name after downsampling
  downsampled_data$Attrition_numeric <- downsampled_data$Class
  downsampled_data$Class <- NULL  # Remove the 'Class' column as it's no longer needed

  # Convert Attrition_numeric to a factor (if not already)
  # Assumes the existing labels are "0"/"1" (the numeric levels are coerced
  # to those strings) — TODO confirm the labels match.
  downsampled_data$Attrition_numeric <- factor(downsampled_data$Attrition_numeric, levels = c(0, 1))


  # Run a decision tree model
  tree_model <- rpart(Attrition_numeric ~ ., data = downsampled_data, method = "class")

  # Predict on the testing set
  predictions <- predict(tree_model, newdata = testing_set, type = "class")
  
  # Evaluate the model
  confusion <- confusionMatrix(predictions, testing_set$Attrition_numeric)
  
  # Return the model and its evaluation
  return(list(model = tree_model, evaluation = confusion))
}

# Process each department dataset
# NOTE(review): these three calls are repeated verbatim a few lines below;
# results are identical (the function seeds its own RNG), so one of the two
# runs is redundant.
sales_results <- process_department_data(sales_data)
hr_results <- process_department_data(hr_data)
rnd_results <- process_department_data(rnd_data)

Predictions on test set and Confusion Matrix

# Process each department dataset
# (Repeat of the calls above; deterministic because the function seeds the RNG.)
sales_results <- process_department_data(sales_data)
hr_results <- process_department_data(hr_data)
rnd_results <- process_department_data(rnd_data)

# Print the confusion matrix for the Sales department
print("Confusion Matrix for Sales Department:")
## [1] "Confusion Matrix for Sales Department:"
print(sales_results$evaluation)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1029  116
##          1  301  220
##                                           
##                Accuracy : 0.7497          
##                  95% CI : (0.7282, 0.7703)
##     No Information Rate : 0.7983          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3553          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7737          
##             Specificity : 0.6548          
##          Pos Pred Value : 0.8987          
##          Neg Pred Value : 0.4223          
##              Prevalence : 0.7983          
##          Detection Rate : 0.6176          
##    Detection Prevalence : 0.6873          
##       Balanced Accuracy : 0.7142          
##                                           
##        'Positive' Class : 0               
## 
# Print the confusion matrix for the HR department
print("Confusion Matrix for HR Department:")
## [1] "Confusion Matrix for HR Department:"
print(hr_results$evaluation)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 421   4
##          1 143  90
##                                           
##                Accuracy : 0.7766          
##                  95% CI : (0.7428, 0.8079)
##     No Information Rate : 0.8571          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4355          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7465          
##             Specificity : 0.9574          
##          Pos Pred Value : 0.9906          
##          Neg Pred Value : 0.3863          
##              Prevalence : 0.8571          
##          Detection Rate : 0.6398          
##    Detection Prevalence : 0.6459          
##       Balanced Accuracy : 0.8520          
##                                           
##        'Positive' Class : 0               
## 
# Print the confusion matrix for the R&D department
print("Confusion Matrix for R&D Department:")
## [1] "Confusion Matrix for R&D Department:"
print(rnd_results$evaluation)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2876  220
##          1 1078  446
##                                          
##                Accuracy : 0.719          
##                  95% CI : (0.7058, 0.732)
##     No Information Rate : 0.8558         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2585         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.7274         
##             Specificity : 0.6697         
##          Pos Pred Value : 0.9289         
##          Neg Pred Value : 0.2927         
##              Prevalence : 0.8558         
##          Detection Rate : 0.6225         
##    Detection Prevalence : 0.6701         
##       Balanced Accuracy : 0.6985         
##                                          
##        'Positive' Class : 0              
## 

Variable Importance for Sales, HR and R&D

# Calculate variable importance for each department's model
# caret::varImp() on an rpart model returns a one-column data frame ("Overall");
# variables never used in a split get a score of 0.
sales_var_imp <- varImp(sales_results$model, scale = FALSE)
hr_var_imp <- varImp(hr_results$model, scale = FALSE)
rnd_var_imp <- varImp(rnd_results$model, scale = FALSE)

# Print the variable importance for Sales department
print("Variable Importance for Sales Department:")
## [1] "Variable Importance for Sales Department:"
print(sales_var_imp)
##                             Overall
## Age                      158.917796
## BusinessTravel           107.007484
## DistanceFromHome          83.735931
## Education                 26.841194
## EnvironmentSatisfaction   25.082053
## JobInvolvement            65.167166
## JobSatisfaction           44.118585
## MaritalStatus             52.295666
## MonthlyIncome             95.513794
## NumCompaniesWorked        24.978251
## OverTime                  36.146428
## PercentSalaryHike         35.233779
## PerformanceRating          2.892857
## StockOptionLevel          15.059482
## TrainingTimesLastYear     30.083053
## YearsAtCompany            59.115339
## YearsSinceLastPromotion   16.635839
## Department                 0.000000
## EducationField             0.000000
## Gender                     0.000000
## RelationshipSatisfaction   0.000000
## WorkLifeBalance            0.000000
# Print the variable importance for HR department
print("Variable Importance for HR Department:")
## [1] "Variable Importance for HR Department:"
print(hr_var_imp)
##                            Overall
## Age                      62.367579
## BusinessTravel           22.913872
## DistanceFromHome         69.957259
## Education                16.652117
## EducationField           42.574613
## EnvironmentSatisfaction  63.398729
## JobInvolvement            9.319177
## JobSatisfaction          12.407505
## MaritalStatus             3.758586
## MonthlyIncome            27.340278
## NumCompaniesWorked        2.403565
## PercentSalaryHike        22.899110
## PerformanceRating         3.953970
## RelationshipSatisfaction 15.531891
## StockOptionLevel         29.644549
## TrainingTimesLastYear    43.370503
## YearsAtCompany           29.429738
## YearsSinceLastPromotion   3.508567
## Department                0.000000
## Gender                    0.000000
## OverTime                  0.000000
## WorkLifeBalance           0.000000
# Print the variable importance for R&D department
print("Variable Importance for R&D Department:")
## [1] "Variable Importance for R&D Department:"
print(rnd_var_imp)
##                             Overall
## Age                      160.693968
## BusinessTravel            69.933187
## DistanceFromHome           6.121094
## Education                  7.423855
## JobInvolvement            52.371749
## JobSatisfaction           34.568477
## MaritalStatus             95.858599
## MonthlyIncome            128.497173
## NumCompaniesWorked         8.517866
## OverTime                 185.272132
## PercentSalaryHike         19.806168
## StockOptionLevel         107.883099
## TrainingTimesLastYear     45.172356
## YearsAtCompany            26.572917
## YearsSinceLastPromotion   10.655467
## Department                 0.000000
## EducationField             0.000000
## EnvironmentSatisfaction    0.000000
## Gender                     0.000000
## PerformanceRating          0.000000
## RelationshipSatisfaction   0.000000
## WorkLifeBalance            0.000000
# Plot variable importance for Sales department
plot(sales_var_imp, main="Variable Importance - Sales Department")

# Plot variable importance for HR department
plot(hr_var_imp, main="Variable Importance - HR Department")

# Plot variable importance for R&D department
plot(rnd_var_imp, main="Variable Importance - R&D Department")

# Hand-entered importance scores for ten Sales-department predictors.
# NOTE(review): these figures differ from the varImp() output printed above
# (e.g. Age 151.4 vs 158.9) and include DailyRate/HourlyRate/JobRole, which
# are absent from the reduced model — they appear to come from an earlier,
# fuller model run; confirm which model they belong to.
# (This assignment shadows the Sales subset created earlier.)
sales_vars <- c("Age", "BusinessTravel", "DailyRate", "DistanceFromHome",
                "Education", "EnvironmentSatisfaction", "HourlyRate",
                "JobInvolvement", "JobLevel", "JobRole")
sales_imp <- c(151.418909, 116.449477, 64.651043, 93.528052,
               7.479869, 15.834588, 61.812332, 62.588595,
               7.500696, 27.948940)
sales_data <- data.frame(Variable = sales_vars, Importance = sales_imp)

# Vertical bar chart, highest importance first, with slanted tick labels.
ggplot(sales_data, aes(x = reorder(Variable, -Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  ggtitle("Variable Importance for Sales Department") +
  xlab("Variable") +
  ylab("Importance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Hand-entered importance scores for ten HR-department predictors.
# NOTE(review): as with the Sales figures, several values and variables
# (DailyRate, HourlyRate, JobRole) do not match the varImp() output printed
# above — confirm their source. (Shadows the HR subset created earlier.)
hr_vars <- c("Age", "BusinessTravel", "DailyRate", "DistanceFromHome",
             "Education", "EducationField", "EnvironmentSatisfaction",
             "HourlyRate", "JobInvolvement", "JobRole")
hr_imp <- c(48.130178, 22.913872, 31.306332, 60.253362,
            10.869100, 46.443127, 61.366961, 26.498422,
            5.205372, 2.909091)
hr_data <- data.frame(Variable = hr_vars, Importance = hr_imp)

# Vertical bar chart, highest importance first, with slanted tick labels.
ggplot(hr_data, aes(x = reorder(Variable, -Importance), y = Importance)) +
  geom_col(fill = "darkgreen") +
  ggtitle("Variable Importance for HR Department") +
  xlab("Variable") +
  ylab("Importance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Hand-entered importance scores for ten R&D-department predictors.
# NOTE(review): values partially disagree with the varImp() output printed
# above (e.g. Age 134.0 vs 160.7) and include DailyRate/MonthlyRate —
# confirm their source. (Shadows the R&D subset created earlier.)
rnd_vars <- c("Age", "BusinessTravel", "DailyRate", "DistanceFromHome",
              "Education", "JobInvolvement", "JobSatisfaction",
              "MaritalStatus", "MonthlyIncome", "MonthlyRate")
rnd_imp <- c(133.977348, 69.933187, 104.876065, 6.121094,
             16.026592, 61.172651, 56.868178, 95.858599,
             128.497173, 49.547281)
rnd_data <- data.frame(Variable = rnd_vars, Importance = rnd_imp)

# Vertical bar chart, highest importance first, with slanted tick labels.
ggplot(rnd_data, aes(x = reorder(Variable, -Importance), y = Importance)) +
  geom_col(fill = "firebrick") +
  ggtitle("Variable Importance for R&D Department") +
  xlab("Variable") +
  ylab("Importance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Final Logistic and Decision Tree models and their Confusion Matrix

# Convert predictions and actual data to factors with consistent levels
# predicted_classes presumably comes from thresholding the logistic model's
# predicted probabilities earlier in the document — confirm against that chunk.
predicted_classes_factor <- factor(predicted_classes, levels = c(0, 1))
actual_classes_factor <- factor(testset_interaction$Attrition_numeric, levels = c(0, 1))
# Calculate and print the confusion matrix and related statistics for logistic regression
conf_matrix_logistic <- confusionMatrix(data = predicted_classes_factor, reference = actual_classes_factor)
print(conf_matrix_logistic)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 502 236
##          1 266 532
##                                           
##                Accuracy : 0.6732          
##                  95% CI : (0.6491, 0.6966)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.3464          
##                                           
##  Mcnemar's Test P-Value : 0.1956          
##                                           
##             Sensitivity : 0.6536          
##             Specificity : 0.6927          
##          Pos Pred Value : 0.6802          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3268          
##    Detection Prevalence : 0.4805          
##       Balanced Accuracy : 0.6732          
##                                           
##        'Positive' Class : 0               
## 
# Reuse the pooled out-of-fold predictions stored by the cross-validated
# tree model ($pred; savePredictions = "final").
tree_predictions <- decision_tree_model_cv$pred
predicted_classes_tree <- factor(tree_predictions$pred, levels = c("X0", "X1"))
actual_classes_tree <- factor(tree_predictions$obs, levels = c("X0", "X1"))

# Calculate and print the confusion matrix and related statistics for decision tree
conf_matrix_tree <- confusionMatrix(data = predicted_classes_tree, reference = actual_classes_tree)
print(conf_matrix_tree)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   X0   X1
##         X0 9504 5177
##         X1 4146 8473
##                                           
##                Accuracy : 0.6585          
##                  95% CI : (0.6528, 0.6641)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.317           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6963          
##             Specificity : 0.6207          
##          Pos Pred Value : 0.6474          
##          Neg Pred Value : 0.6714          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3481          
##    Detection Prevalence : 0.5378          
##       Balanced Accuracy : 0.6585          
##                                           
##        'Positive' Class : X0              
## 
# Compute the confusion matrices
# NOTE(review): this recomputes the matrices already built and printed just
# above (conf_matrix_logistic / conf_matrix_tree) with equivalent inputs;
# the results are the same, so this step is redundant but harmless.
conf_matrix_logistic <- confusionMatrix(as.factor(predicted_classes), as.factor(testset_interaction$Attrition_numeric))
conf_matrix_tree <- confusionMatrix(factor(decision_tree_model_cv$pred$pred, levels = c("X0", "X1")),
                                    factor(decision_tree_model_cv$pred$obs, levels = c("X0", "X1")))

# Pull five headline metrics out of a caret confusionMatrix object into a
# one-row data frame (Accuracy, Kappa, Precision, Recall, F1).
extract_metrics <- function(cm) {
  ov <- cm$overall   # overall statistics vector
  bc <- cm$byClass   # per-class statistics vector
  data.frame(
    Accuracy  = ov["Accuracy"],
    Kappa     = ov["Kappa"],
    Precision = bc["Pos Pred Value"],
    Recall    = bc["Sensitivity"],
    F1        = bc["F1"]
  )
}

# Extract the five summary metrics from each model's confusion matrix
metrics_logistic <- extract_metrics(conf_matrix_logistic)
metrics_tree <- extract_metrics(conf_matrix_tree)

# Stack the two rows, tag each with its model name, then melt to long format
# (columns: Model, variable, value) for grouped plotting.
metrics_df <- rbind(metrics_logistic, metrics_tree)
metrics_df$Model <- c("Logistic Regression", "Decision Tree")
metrics_df <- reshape2::melt(metrics_df, id.vars = "Model")

# Side-by-side bars per metric, with the rounded value printed above each bar.
ggplot(metrics_df, aes(x = variable, y = value, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  geom_text(aes(label = round(value, 2)),
            position = position_dodge(width = 0.8),
            vjust = -0.25,
            size = 3.5) +
  labs(title = "Comparison of Model Metrics", x = "Metric", y = "Value") +
  theme_minimal(base_size = 11.5) +   # larger base size for readability
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom",
        plot.title = element_text(size = 20, face = "bold"),
        axis.title = element_text(size = 14))

library(pROC)
library(ggplot2)
library(plotly)

# Function to plot ROC curve and calculate AUC
#
# predicted_probs: numeric vector of positive-class probabilities.
# actuals:         observed outcomes matching predicted_probs.
# model_name:      label used in the title and to pick the line colour.
# Returns: list(plot = interactive plotly ROC curve, auc = "AUC = x.xx" string).
plot_roc_auc <- function(predicted_probs, actuals, model_name) {
  roc_obj <- roc(actuals, predicted_probs)
  auc_value <- auc(roc_obj)
  
  # Create a data frame for plotting.
  # BUG FIX: pROC stores specificities; the false positive rate is
  # 1 - specificity. Plotting the raw specificities mirrored the ROC curve
  # horizontally and contradicted the axis label below.
  roc_data <- data.frame(
    FPR = 1 - roc_obj$specificities,
    TPR = roc_obj$sensitivities
  )
  
  # Plot ROC curve using ggplot2. linewidth replaces the `size` aesthetic
  # for lines, which is deprecated as of ggplot2 3.4.
  roc_plot <- ggplot(roc_data, aes(x = FPR, y = TPR)) +
    geom_line(color = ifelse(model_name == "Logistic Regression", "blue", "green"), linewidth = 1.5) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    ggtitle(paste("ROC Curve -", model_name)) +
    xlab("False Positive Rate (1 - Specificity)") +
    ylab("True Positive Rate (Sensitivity)") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  
  # Convert ggplot to plotly object for interactivity
  roc_plotly <- ggplotly(roc_plot) %>% layout(legend = list(x = 0.8, y = 0.2))  # Adjust legend position
  
  # Format the AUC value for reporting alongside the plot
  auc_text <- paste("AUC =", round(auc_value, 2))
  
  return(list(plot = roc_plotly, auc = auc_text))
}

# Calculate predicted probabilities for logistic regression model (using "logistic_model_interaction")
predicted_probs_logistic <- predict(logistic_model_interaction, newdata = testset_interaction, type = "response")
# Plot ROC curve and calculate AUC for logistic regression model
roc_logistic <- plot_roc_auc(predicted_probs_logistic, testset_interaction$Attrition_numeric, "Logistic Regression")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate predicted probabilities for decision tree model
# predict() on a caret train object with type = "prob" returns a data frame
# with one column per class; "X1" is the attrition (positive) class.
predicted_probs_tree <- predict(decision_tree_model_cv, newdata = testset_interaction, type = "prob")[, "X1"]
# Plot ROC curve and calculate AUC for decision tree model
roc_tree <- plot_roc_auc(predicted_probs_tree, testset_interaction$Attrition_numeric, "Decision Tree")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_logistic$auc  # Print AUC for logistic regression model
## [1] "AUC = 0.73"
roc_tree$auc  # Print AUC for decision tree model
## [1] "AUC = 0.65"
roc_logistic$plot  # Display ROC plot for logistic regression model
roc_tree$plot  # Display ROC plot for decision tree model